diff --git a/config.py b/config.py
index b78e5a9b8f95ce49b50bfc24c2103a1a10844eb5..b1fd1e5230ce8bd4590ae0a8c5760e8aa03fdf19 100644
--- a/config.py
+++ b/config.py
@@ -23,7 +23,8 @@ def create_parser():
     # The first arg parser parses out only the --config argument, this argument is used to
     # load a yaml file containing key-values that override the defaults for the main parser below
     parser_config = argparse.ArgumentParser(description='Training Config', add_help=False)
-    parser_config.add_argument('-c', '--config', type=str, default='',
-                               help='YAML config file specifying default arguments (default="")')
+    parser_config.add_argument('-c', '--config', type=str,
+                               default='/home/work/user-job-dir/V0001/configs/levit/levit_128s_new_4NPU.yaml',
+                               help='YAML config file specifying default arguments')
 
     # The main parser. It inherits the --config argument for better help information.
@@ -94,7 +95,7 @@ def create_parser():
                             'Example: "randaug-m10-n2-w0-mstd0.5-mmax10-inc0", "autoaug-mstd0.5" or autoaugr-mstd0.5.')
     group.add_argument('--aug_splits', type=int, default=0,
                        help='Number of augmentation splits (default: 0, valid: 3 (currently, only support 3 splits))'
-                       'it should be set with one auto_augment')
+                            'it should be used together with auto_augment')
     group.add_argument('--re_prob', type=float, default=0.0,
                        help='Probability of performing erasing (default=0.0)')
     group.add_argument('--re_scale', type=tuple, default=(0.02, 0.33),
@@ -269,8 +270,33 @@ def create_parser():
                        help='pre_train_model path in obs')
     group.add_argument('--train_url', type=str, default='/cache/output/',
                        help='model folder to save/load')
+    group.add_argument('--pretrain_url', type=str, default='/cache/data/',
+                       help='path of the pretrained model to load')
+    group.add_argument('--model_url', type=str, default='/cache/output/',
+                       help='path to model')
+    group.add_argument('--grampus_code_file_name', type=str, default='',
+                       help='code file name')
+    # arguments for the distillation teacher model
+    group.add_argument('--name_for_distillation', type=str, default='distillation_for_levit',
+                       help='loss name that enables distillation for LeViT')
+    parser.add_argument('--teacher_model', default='regnety_160', type=str,
+                        choices=['regnety_160'],
+                        help='Name of teacher model to train '
+                             '(default: "regnety_160")')
+    parser.add_argument('--teacher_path', type=str, default='')
+    parser.add_argument('--distillation_type', default='none',
+                        choices=['none', 'soft', 'hard'], type=str,
+                        help='distillation type (default: "none")')
+    parser.add_argument('--distillation_alpha', default=0.5, type=float,
+                        help='weight of the distillation loss (default: 0.5)')
+    parser.add_argument('--distillation_tau', default=1.0, type=float,
+                        help='temperature for soft distillation (default: 1.0)')
+    parser.add_argument('--bce_loss', action='store_true')
+    parser.add_argument('--cutmix_minmax', type=float, nargs='+', default=None,
+                        help='cutmix min/max ratio, overrides '
+                             'alpha and enables cutmix if set (default: None)')
     return parser_config, parser
diff --git a/configs/levit/levit_128s_GPU.yaml b/configs/levit/levit_128s_GPU.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0f3f9bdc1e475bfb8a6217804870979ed46e66f5
--- /dev/null
+++ b/configs/levit/levit_128s_GPU.yaml
@@ -0,0 +1,55 @@
+# system config
+mode: 0
+distribute: True
+num_parallel_workers: 8
+
+# dataset config
+dataset: 'imagenet'
+data_dir: ''
+shuffle: True
+dataset_download: False
+batch_size: 256
+drop_remainder: True
+
+# Augmentation config
+image_resize: 224
+scale: [0.08, 1.0]
+ratio: [0.75, 1.333]
+hflip: 0.5
+interpolation: 'bicubic'
+auto_augment: 'randaug-m9-mstd0.5-inc1'
+re_prob: 0.25
+mixup: 0.8
+cutmix: 1.0
+cutmix_prob: 1.0
+color_jitter: 0.4
+
+# model config
+model: 'LeViT_128S'
+num_classes: 1000 +pretrained: False +ckpt_path: '' +keep_checkpoint_max: 10 +ckpt_save_dir: './' +epoch_size: 300 +dataset_sink_mode: True +amp_level: 'O2' + +# loss config +loss: 'CE' +label_smoothing: 0.1 + +# lr scheduler config +scheduler: 'warmup_cosine_decay' +lr: 0.0005 +min_lr: 0.00001 +warmup_epochs: 5 +decay_epochs: 250 +decay_rate: 0.1 + +# optimizer config +opt: 'adamw' +weight_decay: 0.025 +momentum: 0.9 +loss_scale: 1024 +use_nesterov: False \ No newline at end of file diff --git a/configs/levit/levit_128s_ascend.yaml b/configs/levit/levit_128s_ascend.yaml new file mode 100644 index 0000000000000000000000000000000000000000..88a18587c9a746399bb50edbd8a1ef9ff62a66e4 --- /dev/null +++ b/configs/levit/levit_128s_ascend.yaml @@ -0,0 +1,55 @@ +# system config +mode: 0 +distribute: True +num_parallel_workers: 8 + +# dataset config +dataset: 'imagenet' +data_dir: './imagenet/' +shuffle: True +dataset_download: False +batch_size: 256 +drop_remainder: True + +# Augmentation config +image_resize: 224 +scale: [0.08, 1.0] +ratio: [0.75, 1.333] +hflip: 0.5 +interpolation: 'bicubic' +auto_augment: 'randaug-m9-mstd0.5-inc1' +re_prob: 0.25 +mixup: 0.8 +cutmix: 1.0 +cutmix_prob: 1.0 +color_jitter: 0.4 + +# model config +model: 'LeViT_128S' +num_classes: 1000 +pretrained: False +ckpt_path: '' +keep_checkpoint_max: 10 +ckpt_save_dir: './ckpt/' +epoch_size: 300 +dataset_sink_mode: True +amp_level: 'O3' + +# loss config +loss: 'CE' +label_smoothing: 0.1 + +# lr scheduler config +scheduler: 'warmup_cosine_decay' +lr: 0.0005 +min_lr: 0.00001 +warmup_epochs: 5 +decay_epochs: 30 +decay_rate: 0.1 + +# optimizer config +opt: 'adamw' +weight_decay: 0.025 +momentum: 0.9 +loss_scale: 1024 +use_nesterov: False \ No newline at end of file diff --git a/configs/levit/levit_128s_ascend_v2.yaml b/configs/levit/levit_128s_ascend_v2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e5206cd98095a91d1e3ec710191bb87360993ea3 --- /dev/null +++ b/configs/levit/levit_128s_ascend_v2.yaml @@ -0,0 +1,72 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+
+# system config
+mode: 0
+distribute: True
+num_parallel_workers: 8
+val_while_train: True
+
+# dataset config
+dataset: 'imagenet'
+data_dir: './imagenet/'
+shuffle: True
+dataset_download: False
+batch_size: 256
+drop_remainder: True
+
+# Augmentation config
+image_resize: 224
+scale: [0.08, 1.0]
+ratio: [0.75, 1.333]
+hflip: 0.5
+interpolation: 'bicubic'
+auto_augment: 'randaug-m9-mstd0.5-inc1'
+re_prob: 0.25
+mixup: 0.8
+cutmix: 1.0
+cutmix_prob: 1.0
+color_jitter: 0.4
+
+# model config
+model: 'LeViT_128S'
+num_classes: 1000
+pretrained: False
+ckpt_path: ''
+keep_checkpoint_max: 10
+ckpt_save_dir: './ckpt/'
+epoch_size: 1000
+dataset_sink_mode: True
+amp_level: 'O2'
+
+# loss config
+loss: 'CE'
+label_smoothing: 0.1
+
+# lr scheduler config
+scheduler: 'warmup_cosine_decay'
+lr: 0.0005
+min_lr: 0.00001
+warmup_epochs: 5
+decay_epochs: 345
+decay_rate: 0.1
+
+# optimizer config
+opt: 'adamw'
+weight_decay: 0.025
+momentum: 0.9
+loss_scale: 1024
+use_nesterov: False
+eps: 1e-8
diff --git a/configs/levit/levit_128s_ascend_v2_plus.yaml b/configs/levit/levit_128s_ascend_v2_plus.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..94111b911513962e173cdf4c484e49b1c23bdd3f
--- /dev/null
+++ b/configs/levit/levit_128s_ascend_v2_plus.yaml
@@ -0,0 +1,55 @@
+# system config
+mode: 0
+distribute: True
+num_parallel_workers: 8
+
+# dataset config
+dataset: 'imagenet'
+data_dir: './imagenet/'
+shuffle: True
+dataset_download: False
+batch_size: 64
+drop_remainder: True
+
+# Augmentation config
+image_resize: 224
+scale: [0.08, 1.0]
+ratio: [0.75, 1.333]
+hflip: 0.5
+interpolation: 'bicubic'
+auto_augment: 'randaug-m9-mstd0.5-inc1'
+re_prob: 0.25
+mixup: 0.8
+cutmix: 1.0
+cutmix_prob: 1.0
+color_jitter: 0.4
+
+# model config
+model: 'LeViT_128S'
+num_classes: 1000
+pretrained: False
+ckpt_path: ''
+keep_checkpoint_max: 10
+ckpt_save_dir: './ckpt/'
+epoch_size: 500
+dataset_sink_mode: True
+amp_level: 'O2'
+
+# loss config
+loss: 'CE'
+label_smoothing: 0.1
+
+# lr scheduler config
+scheduler: 'warmup_cosine_decay'
+lr: 0.00005
+min_lr: 0.000001
+warmup_epochs: 5
+decay_epochs: 50
+decay_rate: 0.1
+
+# optimizer config
+opt: 'adamw'
+weight_decay: 0.025
+momentum: 0.9
+loss_scale: 1024
+use_nesterov: False
\ No newline at end of file
diff --git a/configs/levit/levit_128s_ascend_v2_se.yaml b/configs/levit/levit_128s_ascend_v2_se.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..90e98cae6413dfdb96a8b9cec2054c748f50cfb3
--- /dev/null
+++ b/configs/levit/levit_128s_ascend_v2_se.yaml
@@ -0,0 +1,55 @@
+# system config
+mode: 0
+distribute: False
+num_parallel_workers: 1
+
+# dataset config
+dataset: 'imagenet'
+data_dir: './imagenet/'
+shuffle: True
+dataset_download: False
+batch_size: 32
+drop_remainder: True
+
+# Augmentation config
+image_resize: 224
+scale: [0.08, 1.0]
+ratio: [0.75, 1.333]
+hflip: 0.5
+interpolation: 'bicubic'
+auto_augment: 'randaug-m9-mstd0.5-inc1'
+re_prob: 0.25
+mixup: 0.8
+cutmix: 1.0
+cutmix_prob: 1.0
+color_jitter: 0.4
+
+# model config
+model: 'LeViT_128S'
+num_classes: 1000
+pretrained: False
+ckpt_path: ''
+keep_checkpoint_max: 10
+ckpt_save_dir: './ckpt/'
+epoch_size: 300
+dataset_sink_mode: True
+amp_level: 'O2'
+
+# loss config
+loss: 'CE'
+label_smoothing: 0.1
+
+# lr scheduler config
+scheduler: 'warmup_cosine_decay'
+lr: 0.0005
+min_lr: 0.00001
+warmup_epochs: 5
+decay_epochs: 30
+decay_rate: 0.1
+
+# optimizer config
+opt: 'adamw'
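+# note: momentum below is only read by SGD/momentum-style optimizers;
+# adamw relies on its default betas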
+weight_decay: 0.025 +momentum: 0.9 +loss_scale: 1024 +use_nesterov: False \ No newline at end of file diff --git a/configs/levit/levit_128s_gpu b/configs/levit/levit_128s_gpu new file mode 100644 index 0000000000000000000000000000000000000000..8a95a971baf8aa7a6aca174c7ed4ce5b8d4618c1 --- /dev/null +++ b/configs/levit/levit_128s_gpu @@ -0,0 +1,56 @@ +# system config +mode: 0 +distribute: False +num_parallel_workers: 1 +val_while_train: True + +# dataset config +dataset: 'imagenet' +data_dir: '/cache/dataset/imagenet/imagenet/' +shuffle: True +dataset_download: False +batch_size: 128 +drop_remainder: True + +# Augmentation config +image_resize: 224 +scale: [0.08, 1.0] +ratio: [0.75, 1.333] +hflip: 0.5 +interpolation: 'bicubic' +auto_augment: 'randaug-m9-mstd0.5-inc1' +re_prob: 0.25 +mixup: 0.8 +cutmix: 1.0 +cutmix_prob: 1.0 +color_jitter: 0.4 + +# model config +model: 'LeViT_128S' +num_classes: 1000 +pretrained: False +ckpt_path: '' +keep_checkpoint_max: 10 +ckpt_save_dir: './ckpt' +epoch_size: 300 +dataset_sink_mode: True +amp_level: 'O3' + +# loss config +loss: 'CE' +label_smoothing: 0.1 + +# lr scheduler config +scheduler: 'warmup_cosine_decay' +lr: 0.0005 +min_lr: 0.00001 +warmup_epochs: 5 +decay_epochs: 30 +decay_rate: 0.1 + +# optimizer config +opt: 'adamw' +weight_decay: 0.025 +momentum: 0.9 +loss_scale: 1024 +use_nesterov: False \ No newline at end of file diff --git a/configs/levit/levit_128s_new.yaml b/configs/levit/levit_128s_new.yaml new file mode 100644 index 0000000000000000000000000000000000000000..459d246f162ad6a16a6eb1c3514940a757e693e2 --- /dev/null +++ b/configs/levit/levit_128s_new.yaml @@ -0,0 +1,58 @@ +# system config +mode: 0 +distribute: True +num_parallel_workers: 8 +val_while_train: True +enable_modelarts: True + +# dataset config +dataset: 'imagenet' +data_dir: '/cache/dataset/imagenet/imagenet/' # './imagenet/' +shuffle: True +dataset_download: False +batch_size: 256 +drop_remainder: True + +# Augmentation config +image_resize: 224 +scale: [0.08, 1.0] +ratio: [0.75, 1.333] +hflip: 0.5 +interpolation: 'bicubic' +auto_augment: 'randaug-m9-mstd0.5-inc1' +re_prob: 0.25 +mixup: 0.8 +cutmix: 1.0 +cutmix_prob: 1.0 +color_jitter: 0.4 + +# model config +model: 'LeViT_128S' +num_classes: 1000 +pretrained: False +ckpt_path: '' +keep_checkpoint_max: 10 +ckpt_save_dir: '/cache/output/' +epoch_size: 350 +dataset_sink_mode: True +amp_level: 'O2' + +# loss config +loss: 'CE' +label_smoothing: 0.1 + +# lr scheduler config +scheduler: 'warmup_cosine_decay' +lr: 0.0005 +min_lr: 0.00001 +warmup_epochs: 5 +decay_epochs: 345 +decay_rate: 0.1 + +# optimizer config +opt: 'adamw' +weight_decay: 0.025 +momentum: 0.9 +loss_scale: 1024 +use_nesterov: False +eps: 1e-8 diff --git a/configs/levit/levit_128s_new_4NPU.yaml b/configs/levit/levit_128s_new_4NPU.yaml new file mode 100644 index 0000000000000000000000000000000000000000..43df957f482fb22ee9323eaa9e40f1952c6d7fbe --- /dev/null +++ b/configs/levit/levit_128s_new_4NPU.yaml @@ -0,0 +1,57 @@ +# system config +mode: 0 +distribute: True +num_parallel_workers: 4 +val_while_train: True + +# dataset config +dataset: 'imagenet' +data_dir: './imagenet/' +shuffle: True +dataset_download: False +batch_size: 256 +drop_remainder: True + +# Augmentation config +image_resize: 224 +scale: [0.08, 1.0] +ratio: [0.75, 1.333] +hflip: 0.5 +interpolation: 'bicubic' +auto_augment: 'randaug-m9-mstd0.5-inc1' +re_prob: 0.25 +mixup: 0.8 +cutmix: 1.0 +cutmix_prob: 1.0 +color_jitter: 0.4 + +# model config +model: 'LeViT_128S' +num_classes: 1000 +pretrained: False 
+ckpt_path: ''
+keep_checkpoint_max: 10
+ckpt_save_dir: './ckpt/'
+epoch_size: 350
+dataset_sink_mode: True
+amp_level: 'O2'
+
+# loss config
+loss: 'CE'
+label_smoothing: 0.1
+
+# lr scheduler config
+scheduler: 'warmup_cosine_decay'
+lr: 0.0005
+min_lr: 0.00001
+warmup_epochs: 5
+decay_epochs: 345
+decay_rate: 0.1
+
+# optimizer config
+opt: 'adamw'
+weight_decay: 0.025
+momentum: 0.9
+loss_scale: 1024
+use_nesterov: False
+eps: 1e-8
diff --git a/configs/levit/levit_128s_new_distillation.yaml b/configs/levit/levit_128s_new_distillation.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f9f3d00d062ddb2f03345b9bf244cf038f2be68e
--- /dev/null
+++ b/configs/levit/levit_128s_new_distillation.yaml
@@ -0,0 +1,67 @@
+# system config
+mode: 0
+distribute: True
+num_parallel_workers: 8
+val_while_train: True
+enable_modelarts: True
+
+# dataset config
+dataset: 'imagenet'
+data_dir: '/cache/dataset/imagenet/imagenet/'  # './imagenet/'
+shuffle: True
+dataset_download: False
+batch_size: 256
+drop_remainder: True
+
+# Augmentation config
+image_resize: 224
+scale: [0.08, 1.0]
+ratio: [0.75, 1.333]
+hflip: 0.5
+interpolation: 'bicubic'
+auto_augment: 'randaug-m9-mstd0.5-inc1'
+re_prob: 0.25
+mixup: 0.8
+cutmix: 1.0
+cutmix_prob: 1.0
+color_jitter: 0.4
+
+# model config
+model: 'LeViT_128S'
+num_classes: 1000
+pretrained: False
+ckpt_path: ''
+keep_checkpoint_max: 10
+ckpt_save_dir: '/cache/output/'
+epoch_size: 350
+dataset_sink_mode: True
+amp_level: 'O2'
+
+# loss config
+loss: 'CE'
+label_smoothing: 0.1
+
+# lr scheduler config
+scheduler: 'warmup_cosine_decay'
+lr: 0.0005
+min_lr: 0.00001
+warmup_epochs: 5
+decay_epochs: 345
+decay_rate: 0.1
+
+# optimizer config
+opt: 'adamw'
+weight_decay: 0.025
+momentum: 0.9
+loss_scale: 1024
+use_nesterov: False
+eps: 1e-8
+
+# Distillation
+distillation_type: hard
+teacher_path: '/cache/code/levit_new/regnety_160.ckpt'
+teacher_model: regnety_160
+distillation_alpha: 0.5
+distillation_tau: 1.0
+bce_loss: false
+
diff --git a/configs/levit/levit_128s_new_lr.yaml b/configs/levit/levit_128s_new_lr.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..856471446d997ae80d5063876461708d2d78e4bd
--- /dev/null
+++ b/configs/levit/levit_128s_new_lr.yaml
@@ -0,0 +1,58 @@
+# system config
+mode: 0
+distribute: True
+num_parallel_workers: 8
+val_while_train: True
+enable_modelarts: True
+
+# dataset config
+dataset: 'imagenet'
+data_dir: '/cache/dataset/imagenet/imagenet/'  # './imagenet/'
+shuffle: True
+dataset_download: False
+batch_size: 256
+drop_remainder: True
+
+# Augmentation config
+image_resize: 224
+scale: [0.08, 1.0]
+ratio: [0.75, 1.333]
+hflip: 0.5
+interpolation: 'bicubic'
+auto_augment: 'randaug-m9-mstd0.5-inc1'
+re_prob: 0.25
+mixup: 0.2  # 0.8
+cutmix: 1.0
+cutmix_prob: 1.0
+color_jitter: 0.4
+
+# model config
+model: 'LeViT_128S'
+num_classes: 1000
+pretrained: False
+ckpt_path: ''
+keep_checkpoint_max: 10
+ckpt_save_dir: '/cache/output/'
+epoch_size: 350
+dataset_sink_mode: True
+amp_level: 'O2'
+
+# loss config
+loss: 'CE'
+label_smoothing: 0.1
+
+# lr scheduler config
+scheduler: 'warmup_cosine_decay'
+lr: 0.00005  # 0.0005
+min_lr: 0.00001
+warmup_epochs: 5
+decay_epochs: 345
+decay_rate: 0.1
+
+# optimizer config
+opt: 'adamw'
+weight_decay: 0.025
+momentum: 0.9
+loss_scale: 1024
+use_nesterov: False
+eps: 1e-8
diff --git a/data_dump.json b/data_dump.json
new file mode 100644
index 0000000000000000000000000000000000000000..757dbc05da43bef105e965c545a474609070988c
--- /dev/null
+++ b/data_dump.json
@@ -0,0
+1,14 @@ +{ + "common_dump_settings": { + "dump_mode": 0, + "path": "", + "net_name": "LeViT", + "iteration": "0|3-4", + "saved_data": "tensor", + "input_output": 0, + "kernels": [""], + "support_device": [0,1,2,3,4,5,6,7], + "op_debug_mode": 3, + "file_format": "npy" + } +} \ No newline at end of file diff --git a/mindcv/loss/distillation/__init__.py b/mindcv/loss/distillation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mindcv/loss/distillation/criterion.py b/mindcv/loss/distillation/criterion.py new file mode 100644 index 0000000000000000000000000000000000000000..01be8fef1f630739c814db087280352913f47dcd --- /dev/null +++ b/mindcv/loss/distillation/criterion.py @@ -0,0 +1,284 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""functions of criterion""" +import mindspore as ms +import mindspore.nn as nn +from mindspore import Tensor +from mindspore import ops +from mindspore.common import dtype as mstype +from mindspore.nn.loss.loss import LossBase +from mindspore.ops import ( + functional, + operations, + LogSoftmax, + KLDivLoss, + Size +) + +from .regnet import regnety_160 + + +# from .factory import create_teacher_model + +class SoftTargetCrossEntropy(LossBase): + """SoftTargetCrossEntropy for MixUp Augment""" + + def __init__(self): + super(SoftTargetCrossEntropy, self).__init__() + self.mean_ops = operations.ReduceMean(keep_dims=False) + self.sum_ops = operations.ReduceSum(keep_dims=False) + self.log_softmax = operations.LogSoftmax() + + def construct(self, logits, labels): + logits = operations.Cast()(logits, mstype.float32) + labels = operations.Cast()(labels, mstype.float32) + loss = self.sum_ops((-1 * labels) * self.log_softmax(logits), -1) + return self.mean_ops(loss) + + +class CrossEntropySmooth(LossBase): + """CrossEntropy""" + + def __init__(self, sparse=True, reduction='mean', + smooth_factor=0., num_classes=1000): + super(CrossEntropySmooth, self).__init__() + self.onehot = operations.OneHot() + self.sparse = sparse + self.on_value = Tensor(1.0 - smooth_factor, mstype.float32) + self.off_value = Tensor( + 1.0 * smooth_factor / (num_classes - 1), mstype.float32 + ) + self.ce = nn.SoftmaxCrossEntropyWithLogits(reduction=reduction) + self.cast = ops.Cast() + + def construct(self, logits, labels): + if self.sparse: + labels = self.onehot( + labels, functional.shape(logits)[1], + self.on_value, self.off_value + ) + labels = operations.Cast()(labels, mstype.float32) + logits = operations.Cast()(logits, mstype.float32) + loss2 = self.ce(logits, labels) + return loss2 + + +class DistillationLoss(LossBase): + """ + This module wraps a standard criterion and adds an extra knowledge + distillation loss by taking a teacher model prediction and + using it as additional supervision. 
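+    With 'hard' distillation the extra term is a cross-entropy against the
+    teacher's argmax prediction; with 'soft' it is a temperature-scaled KL
+    divergence between the student's and the teacher's logit distributions.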
+ """ + + def __init__(self, base_criterion: LossBase, teacher_model: nn.Cell, + distillation_type: str, alpha: float, tau: float): + super().__init__() + self.base_criterion = base_criterion + self.teacher_model = teacher_model + assert distillation_type in ['none', 'soft', 'hard'] + self.distillation_type = distillation_type + self.alpha = alpha + self.tau = tau + + self.kl_div = KLDivLoss(reduction='batchmean') + self.log_softmax = LogSoftmax(axis=1) + self.cross_entropy = nn.CrossEntropyLoss() + + def construct(self, inputs, outputs, labels): + """ + Args: + inputs: The original inputs that are feed to the teacher model + outputs: the outputs of the model to be trained. It is expected to be + either a Tensor, or a Tuple[Tensor, Tensor], with the original output + in the first position and the distillation predictions as the second output + labels: the labels for the base criterion + """ + outputs_kd = None + if not isinstance(outputs, ms.Tensor): + # assume that the model outputs a tuple of [outputs, outputs_kd] + outputs, outputs_kd = outputs + base_loss = self.base_criterion(outputs, labels) + + if self.distillation_type == 'none': + return base_loss + + if outputs_kd is None: + raise ValueError("When knowledge distillation is enabled, the model is " + "expected to return a Tuple[Tensor, Tensor] with the output of the " + "class_token and the dist_token") + + teacher_outputs = self.teacher_model(inputs) + dist_loss = 0.0 + if self.distillation_type == 'soft': + T = self.tau + # taken from https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100 + # with slight modifications + dist_loss = self.kl_div( + self.log_softmax(outputs_kd / T), + # We provide the teacher's targets in log probability because we use log_target=True (as recommended + # in pytorch https://github.com/pytorch/pytorch/blob/9324181d0ac7b4f7949a574dbc3e8be30abe7041/torch + # /nn/functional.py#L2719) but it is possible to give just the probabilities and set + # log_target=False. In our experiments we tried both. + self.log_softmax(teacher_outputs / T), + reduction='sum', + ) * (T * T) / Size()(outputs_kd) + # We divide by outputs_kd.numel() to have the legacy PyTorch behavior. 
+            # We also experimented with dividing by outputs_kd.size(0) instead;
+            # see issue 61 (https://github.com/facebookresearch/deit/issues/61) for more details.
+        elif self.distillation_type == 'hard':
+            dist_loss = self.cross_entropy(
+                outputs_kd, teacher_outputs.argmax(axis=1)
+            )
+
+        loss = base_loss * (1 - self.alpha) + dist_loss * self.alpha
+        return loss
+
+
+def get_model_by_name(model_name, **kwargs):
+    """get network by name and initialize it"""
+
+    models = {
+        'regnety_160': regnety_160
+    }
+    return models[model_name](**kwargs)
+
+
+def create_teacher_model(
+        model_name,
+        checkpoint_path=None,
+        **kwargs):
+    """Create model by name with given parameters"""
+
+    model = get_model_by_name(
+        model_name, **kwargs
+    )
+    if checkpoint_path is not None:
+        param_dict = ms.load_checkpoint(checkpoint_path)
+        ms.load_param_into_net(model, param_dict)
+
+    return model
+
+
+def get_criterion_by_args(args):
+    criterion = get_criterion(
+        smoothing=args.label_smoothing,  # 0.1; reuses the label_smoothing config value
+        num_classes=args.num_classes,  # 1000
+        mixup=args.mixup,  # 0.8
+        cutmix=args.cutmix,  # 1.0
+        # cutmix_minmax is intentionally not forwarded: get_criterion does not
+        # accept it, and passing it would raise a TypeError
+        bce_loss=args.bce_loss,  # False
+        distillation_type=args.distillation_type,  # hard
+        teacher_path=args.teacher_path,  # teacher checkpoint path
+        teacher_model=args.teacher_model,  # regnety_160
+        distillation_alpha=args.distillation_alpha,  # 0.5
+        distillation_tau=args.distillation_tau  # 1.0
+    )
+    return criterion
+
+
+def get_criterion(
+        smoothing,
+        num_classes,
+        mixup,
+        cutmix,
+        bce_loss,
+        distillation_type,
+        teacher_path,
+        teacher_model,
+        distillation_alpha,
+        distillation_tau
+):
+    """Get criterion function"""
+    assert 0 <= smoothing <= 1.
+
+    mixup_active = mixup > 0 or cutmix > 0
+
+    if mixup_active:
+        # smoothing is handled with mixup label transform
+        print(25 * "=" + "Using MixBatch" + 25 * "=")
+        criterion = SoftTargetCrossEntropy()
+    elif smoothing:
+        print(25 * "=" + "Using label smoothing" + 25 * "=")
+        criterion = CrossEntropySmooth(sparse=True, reduction="mean",
+                                       smooth_factor=smoothing,
+                                       num_classes=num_classes)
+    else:
+        criterion = nn.SoftmaxCrossEntropyWithLogits()
+
+    if bce_loss:
+        criterion = nn.BCEWithLogitsLoss()
+
+    teacher_net = None
+    if distillation_type != 'none':
+        assert teacher_path, 'need to specify teacher-path when using distillation'
+        print(f"Creating teacher model: {teacher_model}")
+        teacher_net = create_teacher_model(
+            teacher_model,
+            checkpoint_path=teacher_path,
+        )
+        teacher_net.set_train(False)
+
+    # wrap the criterion in our custom DistillationLoss, which
+    # just dispatches to the original criterion if distillation_type is 'none'
+    criterion = DistillationLoss(
+        criterion,
+        teacher_net,  # e.g. regnety_160
+        distillation_type,  # 'none', 'soft' or 'hard'
+        distillation_alpha,  # 0.5
+        distillation_tau  # 1.0
+    )
+
+    return criterion
+
+
+class NetWithLoss(nn.Cell):
+    """
+    NetWithLoss: only supports classification networks.
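+    The criterion is invoked as criterion(data, predict, label), which lines up
+    with the (inputs, outputs, labels) signature of DistillationLoss.construct.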
+ """ + + def __init__(self, model, criterion): + super(NetWithLoss, self).__init__() + self.model = model + self.criterion = criterion + + def construct(self, *inputs, **kwargs): + data = inputs[0] + label = inputs[1] + predict = self.model(data) + loss = self.criterion(data, predict, label) + return loss diff --git a/mindcv/loss/distillation/regnet.py b/mindcv/loss/distillation/regnet.py new file mode 100644 index 0000000000000000000000000000000000000000..b350b9521be86b2cf0f76a2e3c1b83069157cf9f --- /dev/null +++ b/mindcv/loss/distillation/regnet.py @@ -0,0 +1,650 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + + +import types +import functools +import numpy as np +import mindspore as ms +import mindspore.nn as nn +import mindspore.ops as ops + +from mindspore.ops import Div, UniformReal, Floor +from copy import deepcopy +from typing import Callable, Tuple + +# from .mindcv.models.layers.pooling import GlobalAvgPooling + +# from .classifier import ClassifierHead +# from .conv_bn_act import ConvBnAct +# from .drop_path import DropPath +# from .se import SEModule + + +IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) +IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) + + +def _cfg(url=''): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), + 'pool_size': (7, 7), 'crop_pct': 0.875, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.conv', 'classifier': 'head.fc', + } + + +def _mcfg(**kwargs): + cfg = dict(se_ratio=0., bottle_ratio=1., stem_width=32) + cfg.update(**kwargs) + return cfg + + +# Model FLOPS = three trailing digits * 10^8 +model_cfgs = dict( + regnetx_002=_mcfg(w0=24, wa=36.44, wm=2.49, group_w=8, depth=13), + regnetx_004=_mcfg(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22), + regnetx_006=_mcfg(w0=48, wa=36.97, wm=2.24, group_w=24, depth=16), + regnetx_008=_mcfg(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16), + regnetx_016=_mcfg(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18), + regnetx_032=_mcfg(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25), + regnetx_040=_mcfg(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23), + regnetx_064=_mcfg(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17), + regnetx_080=_mcfg(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23), + regnetx_120=_mcfg(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19), + regnetx_160=_mcfg(w0=216, wa=55.59, wm=2.1, group_w=128, depth=22), + regnetx_320=_mcfg(w0=320, wa=69.86, wm=2.0, group_w=168, depth=23), + regnety_002=_mcfg(w0=24, wa=36.44, wm=2.49, group_w=8, depth=13, se_ratio=0.25), + regnety_004=_mcfg(w0=48, wa=27.89, wm=2.09, group_w=8, depth=16, se_ratio=0.25), + regnety_006=_mcfg(w0=48, wa=32.54, wm=2.32, group_w=16, depth=15, se_ratio=0.25), + regnety_008=_mcfg(w0=56, wa=38.84, wm=2.4, group_w=16, depth=14, se_ratio=0.25), + regnety_016=_mcfg(w0=48, wa=20.71, wm=2.65, group_w=24, depth=27, se_ratio=0.25), + 
regnety_032=_mcfg(w0=80, wa=42.63, wm=2.66, group_w=24, depth=21, se_ratio=0.25), + regnety_040=_mcfg(w0=96, wa=31.41, wm=2.24, group_w=64, depth=22, se_ratio=0.25), + regnety_064=_mcfg(w0=112, wa=33.22, wm=2.27, group_w=72, depth=25, se_ratio=0.25), + regnety_080=_mcfg(w0=192, wa=76.82, wm=2.19, group_w=56, depth=17, se_ratio=0.25), + regnety_120=_mcfg(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, se_ratio=0.25), + regnety_160=_mcfg(w0=200, wa=106.23, wm=2.48, group_w=112, depth=18, se_ratio=0.25), + regnety_320=_mcfg(w0=232, wa=115.89, wm=2.53, group_w=232, depth=20, se_ratio=0.25), +) + + +# ClassifierHead +def adaptive_pool_feat_mult(pool_type='avg'): + if pool_type == 'catavgmax': + return 2 + return 1 + + +class GlobalAvgPooling(nn.Cell): + """ + GlobalAvgPooling, same as torch.nn.AdaptiveAvgPool2d when output shape is 1 + """ + + def __init__(self, keep_dims: bool = False) -> None: + super().__init__() + self.keep_dims = keep_dims + + def construct(self, x): + x = ops.mean(x, axis=(2, 3), keep_dims=self.keep_dims) + return x + + +# class SelectAdaptivePool2d(nn.Cell): +# """Selectable global pooling layer with dynamic input kernel size +# """ +# +# def __init__(self, output_size=1, pool_type='avg', flatten=False): +# super(SelectAdaptivePool2d, self).__init__() +# self.pool_type = pool_type or '' +# self.flatten = flatten +# # self.pool = GlobalAvgPooling(output_size) +# self.pool = ms.ops.AdaptiveAvgPool2D(output_size) +# +# def is_identity(self): +# return self.pool_type == '' +# +# def construct(self, *inputs, **kwargs): +# x = inputs[0] +# x = self.pool(x) +# if self.flatten: +# x = ms.nn.Flatten()(x) +# return x +# +# def feat_mult(self): +# return adaptive_pool_feat_mult(self.pool_type) +# +# def __repr__(self): +# return self.__class__.__name__ + ' (' \ +# + 'pool_type=' + self.pool_type \ +# + ', flatten=' + str(self.flatten) + ')' + + +# def create_classifier(num_features, num_classes, pool_type='avg', use_conv=False): +# flatten = not use_conv # flatten when we use a Linear layer after pooling +# if not pool_type: +# assert num_classes == 0 or use_conv, \ +# 'Pooling can only be disabled if classifier is also removed or conv classifier is used' +# flatten = False # disable flattening if pooling is pass-through (no pooling) +# global_pool = SelectAdaptivePool2d(pool_type=pool_type, flatten=flatten) +# # global_pool = GlobalAvgPooling() +# num_pooled_features = num_features * global_pool.feat_mult() +# if num_classes <= 0: +# fc = nn.Identity() # pass-through (no classifier) +# elif use_conv: +# fc = nn.Conv2d(num_pooled_features, num_classes, 1, has_bias=True, pad_mode='valid') +# else: +# # NOTE: using my Linear wrapper that fixes AMP + torchscript casting issue +# fc = nn.Dense(num_pooled_features, num_classes, has_bias=True) +# return global_pool, fc + + +class ClassifierHead(nn.Cell): + """Classifier head w/ configurable global pooling and dropout.""" + + def __init__(self, in_chs, num_classes, pool_type='avg', drop_rate=0.): + super(ClassifierHead, self).__init__() + self.drop_rate = drop_rate + # self.global_pool, self.fc = create_classifier(in_chs, num_classes, pool_type=pool_type) + self.global_pool = GlobalAvgPooling() + self.fc = nn.Dense(in_chs, num_classes, has_bias=True) + self.dropout = nn.Dropout(1.0 - float(self.drop_rate)) + + def construct(self, *inputs, **kwargs): + x = inputs[0] + x = self.global_pool(x) + if self.drop_rate: + x = self.dropout(x) + x = self.fc(x) + return x + + +# ConvBnAct +def get_padding(kernel_size: int, stride: int = 1, 
dilation: int = 1, **_) -> int: + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + return padding + + +# Can SAME padding for given args be done statically? +def is_static_pad(kernel_size: int, stride: int = 1, dilation: int = 1, **_): + return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0 + + +def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]: + dynamic = False + if isinstance(padding, str): + # for any string padding, the padding will be calculated for you, one of three ways + padding = padding.lower() + if padding == 'same': + # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact + if is_static_pad(kernel_size, **kwargs): + # static case, no extra overhead + padding = get_padding(kernel_size, **kwargs) + else: + # dynamic 'SAME' padding, has runtime/GPU memory overhead + padding = 0 + dynamic = True + elif padding == 'valid': + # 'VALID' padding, same as padding=0 + padding = 0 + else: + # Default to PyTorch style 'same'-ish symmetric padding + padding = get_padding(kernel_size, **kwargs) + return padding, dynamic + + +def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs): + padding = kwargs.pop('padding', 'valid') + kwargs.setdefault('has_bias', False) + padding, _ = get_padding_value(padding, kernel_size, **kwargs) + if padding != 0: + pad_mode = 'pad' + else: + pad_mode = 'valid' + return nn.Conv2d( + in_chs, out_chs, kernel_size, + padding=padding, pad_mode=pad_mode, **kwargs + ) + + +def create_conv2d(in_channels, out_channels, kernel_size, **kwargs): + """ Select a 2d convolution implementation based on arguments + Creates and returns one of torch.nn.Conv2d, Conv2dSame, MixedConv2d, or CondConv2d. + + Used extensively by EfficientNet, MobileNetv3 and related networks. + """ + depthwise = kwargs.pop('depthwise', False) + groups = out_channels if depthwise else kwargs.pop('group', 1) + + m = create_conv2d_pad(in_channels, out_channels, kernel_size, group=groups, **kwargs) + return m + + +class BatchNormAct2d(nn.BatchNorm2d): + """BatchNorm + Activation + + This module performs BatchNorm + Activation in a manner that will remain backwards + compatible with weights trained with separate bn, act. This is why we inherit from BN + instead of composing it as a .bn member. + """ + + def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True, + apply_act=True, act_layer=nn.ReLU, inplace=True, drop_block=None): + super(BatchNormAct2d, self).__init__( + num_features, eps=eps, momentum=1.0 - momentum, affine=affine) + + if act_layer is not None and apply_act: + # act_args = dict(inplace=True) if inplace else {} + self.act = act_layer() + else: + self.act = None + + def _forward_python(self, x): + return super(BatchNormAct2d, self).construct(x) + + def construct(self, *inputs, **kwargs): + x = inputs[0] + x = self._forward_python(x) + if self.act is not None: + x = self.act(x) + return x + + +def convert_norm_act_type(norm_layer, act_layer, norm_kwargs=None): + assert isinstance(norm_layer, (type, str, types.FunctionType, functools.partial)) + assert act_layer is None or isinstance(act_layer, (type, str, types.FunctionType, functools.partial)) + norm_act_args = norm_kwargs.copy() if norm_kwargs else {} + norm_act_layer = BatchNormAct2d + # Must pass `act_layer` through for backwards compat where `act_layer=None` implies no activation. 
+ # In the future, may force use of `apply_act` with `act_layer` arg bound to relevant NormAct types + # It is intended that functions/partial does not trigger this, they should define act. + norm_act_args.update(dict(act_layer=act_layer)) + return norm_act_layer, norm_act_args + + +class ConvBnAct(nn.Cell): + def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding='', dilation=1, groups=1, + norm_layer=nn.BatchNorm2d, norm_kwargs=None, act_layer=nn.ReLU, apply_act=True, + drop_block=None, aa_layer=None): + super(ConvBnAct, self).__init__() + use_aa = aa_layer is not None + + self.conv = create_conv2d( + in_channels, out_channels, kernel_size, stride=1 if use_aa else stride, + padding=padding, dilation=dilation, group=groups, has_bias=False) + + # NOTE for backwards compatibility with models that use separate norm and act layer definitions + norm_act_layer, norm_act_args = convert_norm_act_type(norm_layer, act_layer, norm_kwargs) + self.bn = norm_act_layer(out_channels, apply_act=apply_act, drop_block=drop_block, **norm_act_args) + self.aa = aa_layer(channels=out_channels) if stride == 2 and use_aa else None + + @property + def in_channels(self): + return self.conv.in_channels + + @property + def out_channels(self): + return self.conv.out_channels + + def construct(self, *inputs, **kwargs): + x = inputs[0] + x = self.conv(x) + x = self.bn(x) + if self.aa is not None: + x = self.aa(x) + return x + + +# DropPath +def drop_path(x, drop_prob: float = 0., training: bool = False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + + random_tensor = keep_prob + UniformReal()(shape) + random_tensor = Floor()(random_tensor) # binarize + output = Div()(x, keep_prob) * random_tensor + return output + + +# SEModule +class DropPath(nn.Cell): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
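+    Kept paths are rescaled by 1 / keep_prob so that the expected value of the
+    activations is unchanged at training time.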
+ """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def construct(self, *inputs, **kwargs): + x = inputs[0] + return drop_path(x, self.drop_prob, self.training) + + +class SEModule(nn.Cell): + + def __init__(self, channels, reduction=16, act_layer=nn.ReLU, min_channels=8, reduction_channels=None, + gate_layer=nn.Sigmoid): + super(SEModule, self).__init__() + reduction_channels = reduction_channels or max(channels // reduction, min_channels) + self.fc1 = nn.Conv2d(channels, reduction_channels, kernel_size=1, has_bias=True, pad_mode='valid') + self.act = act_layer() # inplace=True) + self.fc2 = nn.Conv2d(reduction_channels, channels, kernel_size=1, has_bias=True, pad_mode='valid') + self.gate = gate_layer() + + def construct(self, *inputs, **kwargs): + x = inputs[0] + x_se = x.mean((2, 3), keep_dims=True) + x_se = self.fc1(x_se) + x_se = self.act(x_se) + x_se = self.fc2(x_se) + return x * self.gate(x_se) + + +def quantize_float(f, q): + """Converts a float to closest non-zero int divisible by q.""" + return int(round(f / q) * q) + + +def adjust_widths_groups_comp(widths, bottle_ratios, groups): + """Adjusts the compatibility of widths and groups.""" + bottleneck_widths = [int(w * b) for w, b in zip(widths, bottle_ratios)] + groups = [min(g, w_bot) for g, w_bot in zip(groups, bottleneck_widths)] + bottleneck_widths = [quantize_float(w_bot, g) for w_bot, g in + zip(bottleneck_widths, groups)] + widths = [int(w_bot / b) for w_bot, b in + zip(bottleneck_widths, bottle_ratios)] + return widths, groups + + +def generate_regnet(width_slope, width_initial, width_mult, depth, q=8): + """Generates per block widths from RegNet parameters.""" + assert width_slope >= 0 and width_initial > 0 and width_mult > 1 and width_initial % q == 0 + widths_cont = np.arange(depth) * width_slope + width_initial + width_exps = np.round( + np.log(widths_cont / width_initial) / np.log(width_mult)) + widths = width_initial * np.power(width_mult, width_exps) + widths = np.round(np.divide(widths, q)) * q + num_stages, max_stage = len(np.unique(widths)), width_exps.max() + 1 + widths, widths_cont = widths.astype(int).tolist(), widths_cont.tolist() + return widths, num_stages, max_stage, widths_cont + + +class Bottleneck(nn.Cell): + """ RegNet Bottleneck + + This is almost exactly the same as a ResNet Bottleneck. The main difference is the SE block is moved from + after conv3 to after conv2. Otherwise, it's just redefining the arguments for groups/bottleneck channels. 
+ """ + + def __init__(self, in_chs, out_chs, stride=1, dilation=1, + bottleneck_ratio=1, group_width=1, se_ratio=0.25, + downsample=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, + aa_layer=None, + drop_block=None, drop_path=None): + + super(Bottleneck, self).__init__() + bottleneck_chs = int(round(out_chs * bottleneck_ratio)) + groups = bottleneck_chs // group_width + + cargs = dict(act_layer=act_layer, norm_layer=norm_layer, + aa_layer=aa_layer, drop_block=drop_block) + self.conv1 = ConvBnAct(in_chs, bottleneck_chs, kernel_size=1, **cargs) + self.conv2 = ConvBnAct( + bottleneck_chs, bottleneck_chs, kernel_size=3, stride=stride, + dilation=dilation, + groups=groups, **cargs) + if se_ratio: + se_channels = int(round(in_chs * se_ratio)) + self.se = SEModule(bottleneck_chs, reduction_channels=se_channels) + else: + self.se = None + cargs['act_layer'] = None + self.conv3 = ConvBnAct(bottleneck_chs, out_chs, kernel_size=1, **cargs) + self.act3 = act_layer() + self.downsample = downsample + self.drop_path = drop_path + + def zero_init_last_bn(self): + nn.init.zeros_(self.conv3.bn.weight) + + def construct(self, *inputs, **kwargs): + x = inputs[0] + shortcut = x + x = self.conv1(x) + x = self.conv2(x) + if self.se is not None: + x = self.se(x) + x = self.conv3(x) + if self.drop_path is not None: + x = self.drop_path(x) + if self.downsample is not None: + shortcut = self.downsample(shortcut) + x += shortcut + x = self.act3(x) + return x + + +def downsample_conv( + in_chs, out_chs, kernel_size, stride=1, dilation=1, norm_layer=None): + norm_layer = norm_layer or nn.BatchNorm2d + kernel_size = 1 if stride == 1 and dilation == 1 else kernel_size + dilation = dilation if kernel_size > 1 else 1 + return ConvBnAct( + in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, + norm_layer=norm_layer, act_layer=None) + + +class RegStage(nn.Cell): + """Stage (sequence of blocks w/ the same output shape).""" + + def __init__(self, in_chs, out_chs, stride, dilation, depth, bottle_ratio, + group_width, + block_fn=Bottleneck, se_ratio=0., drop_path_rates=None, + drop_block=None): + super(RegStage, self).__init__() + block_kwargs = {} # FIXME setup to pass various aa, norm, act layer common args + first_dilation = 1 if dilation in (1, 2) else 2 + + list_of_block_fn = [nn.Identity()] + + for i in range(depth): + block_stride = stride if i == 0 else 1 + block_in_chs = in_chs if i == 0 else out_chs + block_dilation = first_dilation if i == 0 else dilation + if drop_path_rates is not None and drop_path_rates[i] > 0.: + drop_path = DropPath(drop_path_rates[i]) + else: + drop_path = None + if (block_in_chs != out_chs) or (block_stride != 1): + proj_block = downsample_conv(block_in_chs, out_chs, 1, + block_stride, block_dilation) + else: + proj_block = None + + list_of_block_fn.append( + block_fn( + block_in_chs, out_chs, block_stride, block_dilation, + bottle_ratio, group_width, se_ratio, + downsample=proj_block, drop_block=drop_block, + drop_path=drop_path, **block_kwargs) + ) + + self.b = nn.CellList(list_of_block_fn) + + def construct(self, *inputs, **kwargs): + x = inputs[0] + for block in self.b: + x = block(x) + return x + + +class RegNet(nn.Cell): + """RegNet model. 
+
+    Paper: https://arxiv.org/abs/2003.13678
+
+    Original Impl: https://github.com/facebookresearch/pycls/blob/master/pycls/models/regnet.py
+    """
+
+    def __init__(self, cfg, in_chans=3, num_classes=1000, output_stride=32,
+                 global_pool='avg', drop_rate=0.,
+                 drop_path_rate=0., zero_init_last_bn=True):
+
+        super().__init__()
+        # TODO add drop block, drop path, anti-aliasing, custom bn/act args
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        assert output_stride in (8, 16, 32)
+
+        # Construct the stem
+        stem_width = cfg['stem_width']
+        self.stem = ConvBnAct(in_chans, stem_width, 3, stride=2)
+        self.feature_info = [
+            dict(num_chs=stem_width, reduction=2, module='stem')
+        ]
+
+        # Construct the stages
+        prev_width = stem_width
+        curr_stride = 2
+        stage_params = self._get_stage_params(cfg, output_stride=output_stride,
+                                              drop_path_rate=drop_path_rate)
+        se_ratio = cfg['se_ratio']
+        list_of_regstages = [nn.Identity()]
+        for i, stage_args in enumerate(stage_params):
+            stage_name = "s{}".format(i + 1)
+            list_of_regstages.append(RegStage(prev_width, **stage_args, se_ratio=se_ratio))
+            prev_width = stage_args['out_chs']
+            curr_stride *= stage_args['stride']
+            self.feature_info += [
+                dict(num_chs=prev_width, reduction=curr_stride,
+                     module=stage_name)]
+        self.s = nn.CellList(list_of_regstages)
+        # Construct the head
+        self.num_features = prev_width
+        self.head = ClassifierHead(
+            in_chs=prev_width, num_classes=num_classes, pool_type=global_pool,
+            drop_rate=drop_rate)
+
+    def _get_stage_params(self, cfg, default_stride=2, output_stride=32,
+                          drop_path_rate=0.):
+        # Generate RegNet ws per block
+        w_a, w_0, w_m, d = cfg['wa'], cfg['w0'], cfg['wm'], cfg['depth']
+        widths, num_stages, _, _ = generate_regnet(w_a, w_0, w_m, d)
+
+        # Convert to per stage format
+        stage_widths, stage_depths = np.unique(widths, return_counts=True)
+
+        # Use the same group width, bottleneck mult and stride for each stage
+        stage_groups = [cfg['group_w'] for _ in range(num_stages)]
+        stage_bottle_ratios = [cfg['bottle_ratio'] for _ in range(num_stages)]
+        stage_strides = []
+        stage_dilations = []
+        net_stride = 2
+        dilation = 1
+        for _ in range(num_stages):
+            if net_stride >= output_stride:
+                dilation *= default_stride
+                stride = 1
+            else:
+                stride = default_stride
+                net_stride *= stride
+            stage_strides.append(stride)
+            stage_dilations.append(dilation)
+        stage_dpr = np.split(np.linspace(0, drop_path_rate, d),
+                             np.cumsum(stage_depths[:-1]))
+
+        # Adjust the compatibility of ws and gws
+        stage_widths, stage_groups = adjust_widths_groups_comp(stage_widths,
+                                                               stage_bottle_ratios,
+                                                               stage_groups)
+        param_names = ['out_chs', 'stride', 'dilation', 'depth',
+                       'bottle_ratio', 'group_width', 'drop_path_rates']
+        stage_params = [
+            dict(zip(param_names, params)) for params in
+            zip(stage_widths, stage_strides, stage_dilations, stage_depths,
+                stage_bottle_ratios, stage_groups,
+                stage_dpr)]
+        return stage_params
+
+    def get_classifier(self):
+        return self.head.fc
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.head = ClassifierHead(self.num_features, num_classes,
+                                   pool_type=global_pool,
+                                   drop_rate=self.drop_rate)
+
+    def forward_features(self, x):
+        # same as construct() without the classifier head; MindSpore cells do
+        # not provide children(), so the stem and stages are iterated explicitly
+        x = self.stem(x)
+        for block in self.s:
+            x = block(x)
+        return x
+
+    def construct(self, *inputs, **kwargs):
+        x = inputs[0]
+
+        x = self.stem(x)
+        for block in self.s:
+            x = block(x)
+        x = self.head(x)
+        return x
+
+
+def build_model_with_cfg(
+        model_cls: Callable,
+        default_cfg: dict,
+        model_cfg: dict = None,
+        **kwargs):
+    model = model_cls(**kwargs) if model_cfg is None else model_cls(
+        cfg=model_cfg, **kwargs)
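+    # keep a copy of the default config (url, input size, classifier name) on
+    # the model, mirroring the timm convention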
+    model.default_cfg = deepcopy(default_cfg)
+    return model
+
+
+def _create_regnet(variant, **kwargs):
+    return build_model_with_cfg(
+        RegNet, default_cfg=_cfg(),
+        model_cfg=model_cfgs[variant], **kwargs
+    )
+
+
+def regnety_160(**kwargs):
+    """RegNetY-16GF"""
+    return _create_regnet('regnety_160', **kwargs)
+
+# if __name__ == '__main__':
+#     import numpy as np
+#     import mindspore
+#     from mindspore import Tensor
+#     from mindspore import context
+#
+#     context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU")
+#     # context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
+#
+#     model = regnety_160()
+#     print(model)
+#     dummy_input = Tensor(np.random.rand(4, 3, 224, 224), dtype=mindspore.float32)
+#     y = model(dummy_input)
+#     print(y.shape)
diff --git a/mindcv/loss/loss_factory.py b/mindcv/loss/loss_factory.py
index 54659ec95b9a8d7fdd9325f9c5e4a0c7647e0a97..9284b49216b45584b8b37ff0f1c68e24a6964f08 100644
--- a/mindcv/loss/loss_factory.py
+++ b/mindcv/loss/loss_factory.py
@@ -7,16 +7,26 @@ from .asymmetric import AsymmetricLossMultilabel, AsymmetricLossSingleLabel
 from .binary_cross_entropy_smooth import BinaryCrossEntropySmooth
 from .cross_entropy_smooth import CrossEntropySmooth
 from .jsd import JSDCrossEntropy
+from .distillation.criterion import get_criterion
 
 __all__ = ["create_loss"]
 
 
 def create_loss(
     name: str = "CE",
     weight: Optional[Tensor] = None,
     reduction: str = "mean",
     label_smoothing: float = 0.0,
     aux_factor: float = 0.0,
+    num_classes: int = 1000,
+    mixup: float = 0.8,
+    cutmix: float = 1.0,
+    bce_loss: bool = False,
+    distillation_type: str = 'hard',
+    teacher_path: str = '',
+    teacher_model: str = 'regnety_160',
+    distillation_alpha: float = 0.5,
+    distillation_tau: float = 1.0,
 ):
 
@@ -31,6 +41,15 @@ def create_loss(
         from overfitting when calculating Loss. The value range is [0.0, 1.0]. Default: 0.0.
         aux_factor (float): Auxiliary loss factor. Set aux_factor > 0.0 if the model has auxiliary logit outputs (i.e., deep supervision), like inception_v3. Default: 0.0.
+        num_classes (int): Number of label classes, used by the distillation criterion. Default: 1000.
+        mixup (float): MixUp alpha; a value > 0 selects the soft-target base criterion. Default: 0.8.
+        cutmix (float): CutMix alpha; a value > 0 selects the soft-target base criterion. Default: 1.0.
+        bce_loss (bool): If True, use BCEWithLogitsLoss as the base criterion. Default: False.
+        distillation_type (str): One of 'none', 'soft' or 'hard'. Default: 'hard'.
+        teacher_path (str): Path to the teacher checkpoint. Default: ''.
+        teacher_model (str): Name of the teacher model. Default: 'regnety_160'.
+        distillation_alpha (float): Weight of the distillation term. Default: 0.5.
+        distillation_tau (float): Temperature for soft distillation. Default: 1.0.
 
     Inputs:
         - logits (Tensor or Tuple of Tensor): Input logits.
             Shape [N, C], where N means the number of samples,
@@ -58,6 +77,18 @@ def create_loss(
         loss = AsymmetricLossMultilabel()
     elif name == "jsd":
         loss = JSDCrossEntropy(smoothing=label_smoothing, aux_factor=aux_factor, reduction=reduction, weight=weight)
+    elif name == "distillation_for_levit":
+        loss = get_criterion(
+            smoothing=label_smoothing,  # 0.1; reuses the label_smoothing argument
+            num_classes=num_classes,  # 1000
+            mixup=mixup,  # 0.8
+            cutmix=cutmix,  # 1.0
+            bce_loss=bce_loss,  # False
+            distillation_type=distillation_type,  # hard
+            teacher_path=teacher_path,  # teacher checkpoint path
+            teacher_model=teacher_model,  # regnety_160
+            distillation_alpha=distillation_alpha,  # 0.5
+            distillation_tau=distillation_tau)
     else:
         raise NotImplementedError
diff --git a/mindcv/models/__init__.py b/mindcv/models/__init__.py
index 6f28ba6f149c208d0d4a8755862443d2f6660496..5eda95bf45809e2e11fa312affc3921a2b9abe82 100644
--- a/mindcv/models/__init__.py
+++ b/mindcv/models/__init__.py
@@ -54,6 +54,7 @@ from . import (
     volo,
     xception,
     xcit,
+    levit,
 )
 from .bit import *
 from .cait import *
@@ -110,6 +111,7 @@ from .vit import *
 from .volo import *
 from .xception import *
 from .xcit import *
+from .levit import *
 
 # some net module is replaced by the net function with the same name when we do from .net import *
 # we cannot use net.__all__, so we manually copy net.__all__ here.
@@ -168,3 +170,4 @@ __all__.extend(vit.__all__)
 __all__.extend(volo.__all__)
 __all__.extend(["Xception", "xception"])
 __all__.extend(xcit.__all__)
+__all__.extend(levit.__all__)
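For reference, a minimal sketch of how the new branch is reached (argument values mirror `levit_128s_new_distillation.yaml`; the checkpoint path is illustrative, and this assumes `mindcv.loss` re-exports `create_loss` as in the existing package):

```python
from mindcv.loss import create_loss

# name must match the --name_for_distillation default added in config.py
criterion = create_loss(
    name="distillation_for_levit",
    label_smoothing=0.1,
    num_classes=1000,
    mixup=0.8,                # > 0, so the soft-target base criterion is selected
    cutmix=1.0,
    distillation_type="hard",
    teacher_path="/cache/code/levit_new/regnety_160.ckpt",  # illustrative path
    teacher_model="regnety_160",
    distillation_alpha=0.5,
    distillation_tau=1.0,
)
```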
diff --git a/mindcv/models/levit.py b/mindcv/models/levit.py
new file mode 100644
index 0000000000000000000000000000000000000000..0edd93ea9afce7c7603c4686c5c2aafa5731f099
--- /dev/null
+++ b/mindcv/models/levit.py
@@ -0,0 +1,699 @@
+"""
+MindSpore implementation of `LeViT`.
+Refer to: "LeViT: a Vision Transformer in ConvNet's Clothing for Faster Inference"
+(https://arxiv.org/abs/2104.01136)
+"""
+import itertools
+import numpy as np
+
+import mindspore as ms
+import mindspore.nn as nn
+import mindspore.ops as ops
+import mindspore.common.initializer as init
+from mindspore.common.initializer import initializer, TruncatedNormal
+
+from mindspore import Parameter, Tensor
+from mindspore import load_checkpoint, load_param_into_net
+
+from .helpers import load_pretrained
+from .registry import register_model
+
+# IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
+# IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
+
+__all__ = [
+    "LeViT",
+    "LeViT_128S",
+    "LeViT_128",
+    "LeViT_192",
+    "LeViT_256",
+    "LeViT_384",
+]
+
+
+def mindspore_params(network):
+    ms_params = {}
+    for param in network.get_parameters():
+        name = param.name
+        value = param.data.asnumpy()
+        print(name, value.shape)
+        ms_params[name] = value
+    return ms_params
+
+
+def _cfg(url='', **kwargs):  # TODO: verify these fields once pretrained checkpoints are released
+    return {
+        'url': url,
+        'num_classes': 1000,
+        'input_size': (3, 224, 224),
+        # 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'patch_embed.proj', 'classifier': 'head',
+        **kwargs
+    }
+
+
+default_cfgs = {
+    'LeViT_128S': _cfg(url=''),
+    'LeViT_128': _cfg(url=''),
+    'LeViT_192': _cfg(url=''),
+    'LeViT_256': _cfg(url=''),
+    'LeViT_384': _cfg(url='')
+}
+
+FLOPS_COUNTER = 0
+
+
+# Building blocks that combine convolution / dense layers with batch norm,
+# plus the residual wrapper used throughout the network.
+class Conv2d_BN(nn.SequentialCell):
+    def __init__(self,
+                 a: int,
+                 b: int,
+                 ks: int = 1,
+                 stride: int = 1,
+                 pad: int = 0,
+                 dilation: int = 1,
+                 group: int = 1,
+                 resolution: int = -10000) -> None:
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels=a,
+                              out_channels=b,
+                              kernel_size=ks,
+                              stride=stride,
+                              padding=pad,
+                              dilation=dilation,
+                              group=group,
+                              has_bias=False,
+                              pad_mode="pad")
+
+        # MindSpore's BN momentum is 1 - PyTorch's momentum (0.9 here vs 0.1 there)
+        self.bn = nn.BatchNorm2d(num_features=b,
+                                 gamma_init="ones",
+                                 beta_init="zeros",
+                                 use_batch_statistics=True,
+                                 momentum=0.9)
+
+        # global FLOPS_COUNTER
+        # output_points = ((resolution + 2 * pad - dilation *
+        #                   (ks - 1) - 1) // stride + 1) ** 2
+        # FLOPS_COUNTER += a * b * output_points * (ks ** 2) // group
+
+    def construct(self, x: Tensor) -> Tensor:
+        x = self.conv(x)
+        x = self.bn(x)
+        return x
+
+
+class Linear_BN(nn.SequentialCell):
+    def __init__(self,
+                 a: int,
+                 b: int,
+                 resolution: int = -100000) -> None:
+        super().__init__()
+        self.linear = nn.Dense(a,
+                               b,
+                               weight_init='Uniform',
+                               bias_init='Uniform',
+                               has_bias=False)
+
+        self.bn1d = nn.BatchNorm1d(num_features=b,
+                                   gamma_init="ones",
+                                   beta_init="zeros",
+                                   use_batch_statistics=True,
+                                   momentum=0.9)
+
+        # global FLOPS_COUNTER
+        # output_points = resolution ** 2
+        # FLOPS_COUNTER += a * b * output_points
+
+    def construct(self, x: Tensor) -> Tensor:
+        x = self.linear(x)
+        # flatten (B, N, C) tokens to (B*N, C) so BatchNorm1d normalizes per channel
+        x1, x2, x3 = x.shape
+        new_x = ops.reshape(x, (x1 * x2, x3))
+        x = self.bn1d(new_x).reshape(x.shape)
+        return x
+
+
+class BN_Linear(nn.SequentialCell):
+    def __init__(self,
+                 a: int,
+                 b: int,
+                 bias: bool = True,
+                 std: float = 0.02) -> None:
+        super().__init__()
+        self.bn1d = nn.BatchNorm1d(num_features=a,
+                                   gamma_init="ones",
+                                   beta_init="zeros",
+                                   use_batch_statistics=True,
+                                   momentum=0.9)
+
+        self.linear = nn.Dense(a,
+                               b,
+                               weight_init=init.TruncatedNormal(sigma=std),
+                               bias_init='zeros',
+                               has_bias=bias)
+
+        # global FLOPS_COUNTER
+        # FLOPS_COUNTER += a * b
+
+    def construct(self, x: Tensor) -> Tensor:
+        x = self.bn1d(x)
+        x = self.linear(x)
+        return x
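+
+# Linear_BN applies BatchNorm over the flattened (B*N, C) tokens after the
+# projection, while BN_Linear (used for the classifier heads) normalizes the
+# input features before the final Dense layer.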
+
+
+class Residual(nn.Cell):
+    def __init__(self,
+                 m: nn.Cell = None,
+                 drop: float = 0.):
+        super().__init__()
+        self.m = m
+        self.drop = drop
+
+    def construct(self, x: Tensor) -> Tensor:
+        if self.training and self.drop > 0:
+            # stochastic depth: randomly drop the residual branch per sample
+            # (note: host-side numpy randomness, kept from the original code)
+            mask = Tensor(
+                (np.random.rand(x.shape[0], 1, 1) > self.drop) / (1 - self.drop),
+                ms.float32)
+            return x + self.m(x) * mask
+        y = self.m(x)
+        x = x + y
+        return x
+
+
+# def b16(n, activation=nn.HSwish, resolution=224):  # reference CNN patch-embedding factory
+#     return nn.SequentialCell(
+#         Conv2d_BN(3, n // 8, 3, 2, 1, resolution=resolution),
+#         activation(),
+#         Conv2d_BN(n // 8, n // 4, 3, 2, 1, resolution=resolution // 2),
+#         activation(),
+#         Conv2d_BN(n // 4, n // 2, 3, 2, 1, resolution=resolution // 4),
+#         activation(),
+#         Conv2d_BN(n // 2, n, 3, 2, 1, resolution=resolution // 8))
+
+
+class MLP(nn.Cell):  # CNN patch embedding: four stride-2 Conv2d_BN stages
+    def __init__(self,
+                 n: int,
+                 resolution: int = 224) -> None:
+        super().__init__()
+
+        self.act = nn.HSwish()
+        self.cb1 = Conv2d_BN(3, n // 8, 3, 2, 1, resolution=resolution)
+        self.cb2 = Conv2d_BN(n // 8, n // 4, 3, 2, 1, resolution=resolution // 2)
+        self.cb3 = Conv2d_BN(n // 4, n // 2, 3, 2, 1, resolution=resolution // 4)
+        self.cb4 = Conv2d_BN(n // 2, n, 3, 2, 1, resolution=resolution // 8)
+
+    def construct(self, x: Tensor) -> Tensor:
+        x = self.cb1(x)
+        x = self.act(x)
+        x = self.cb2(x)
+        x = self.act(x)
+        x = self.cb3(x)
+        x = self.act(x)
+        x = self.cb4(x)
+        return x
+
+
+class Subsample(nn.Cell):  # token downsampling by strided slicing of the spatial grid
+    def __init__(self,
+                 stride: int,
+                 resolution: int):
+        super().__init__()
+        self.stride = stride
+        self.resolution = resolution
+
+    def construct(self, x: Tensor) -> Tensor:
+        B, N, C = x.shape
+        x = x.view(B, self.resolution, self.resolution, C)[
+            :, ::self.stride, ::self.stride].reshape(B, -1, C)
+        return x
+
+
+class Attention(nn.Cell):  # multi-head attention with learned relative position biases
+    def __init__(self,
+                 dim: int,
+                 key_dim: int,
+                 num_heads: int = 8,
+                 attn_ratio: int = 4,
+                 activation: type = None,
+                 resolution: int = 14) -> None:
+
+        super().__init__()
+
+        self.num_heads = num_heads
+        self.scale = key_dim ** -0.5
+        self.key_dim = key_dim
+        self.nh_kd = nh_kd = key_dim * num_heads
+        self.d = int(attn_ratio * key_dim)
+        self.dh = int(attn_ratio * key_dim) * num_heads
+        self.attn_ratio = attn_ratio
+        h = self.dh + nh_kd * 2
+        self.qkv = Linear_BN(dim, h, resolution=resolution)
+        self.proj = nn.SequentialCell(activation(), Linear_BN(self.dh, dim, resolution=resolution))
+
+        # enumerate all (row, col) positions of the token grid
+        points = list(itertools.product(range(resolution), range(resolution)))
+        self.N = len(points)
+
+        attention_offsets = {}
+        idxs = []
+        for p1 in points:
+            for p2 in points:
+                offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
+                if offset not in attention_offsets:
+                    attention_offsets[offset] = len(attention_offsets)
+                idxs.append(attention_offsets[offset])
+
+        self.attention_biases = ms.Parameter(
+            Tensor(np.zeros([num_heads, len(attention_offsets)], np.float32)))
+
+        attention_bias_idxs = ms.Tensor(idxs, dtype=ms.int64).view(self.N, self.N)
+        self.attention_bias_idxs = ms.Parameter(attention_bias_idxs, requires_grad=False)
+
+        # cached bias table for inference; training indexes the live parameter
+        self.ab = self.attention_biases[:, self.attention_bias_idxs]
+
+        self.softmax = nn.Softmax(axis=-1)
+
+        # global FLOPS_COUNTER
+        # # queries * keys
+        # FLOPS_COUNTER += num_heads * (resolution ** 4) * key_dim
+        # # softmax
+        # FLOPS_COUNTER += num_heads * (resolution ** 4)
+        # # attention * v
+        # FLOPS_COUNTER += num_heads * self.d * (resolution ** 4)
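+
+    # For intuition: with resolution=2 the loops above visit 16 (query, key)
+    # pairs but only 4 distinct absolute offsets, so attention_biases stores 4
+    # learnable values per head and attention_bias_idxs expands them into the
+    # full 4x4 bias matrix shared by all position pairs with the same offset.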
+
+    def construct(self,
+                  x: Tensor) -> Tensor:  # x: (B, N, C)
+        B, N, C = x.shape
+        atte = self.qkv(x).view(B, N, self.num_heads, -1)
+        # Split the fused projection into q (key_dim), k (key_dim) and v (d) channels.
+        qkv = ms.numpy.split(atte,
+                             [self.key_dim,
+                              self.key_dim + self.key_dim],
+                             axis=3)
+        q = qkv[0]
+        k = qkv[1]
+        v = qkv[2]
+
+        q = ops.transpose(q, (0, 2, 1, 3))
+        k = ops.transpose(k, (0, 2, 1, 3))
+        v = ops.transpose(v, (0, 2, 1, 3))
+
+        attn = (
+            ops.matmul(q, ops.transpose(k, (0, 1, 3, 2))) * self.scale
+            +
+            (self.attention_biases[:, self.attention_bias_idxs]
+             if self.training else self.ab)
+        )
+
+        attn = self.softmax(attn)
+
+        x = ops.transpose(ops.matmul(attn, v), (0, 2, 1, 3))
+
+        x = x.reshape(B, N, self.dh)
+
+        x = self.proj(x)
+
+        return x
+
+
+# AttentionSubsample: a downsampling layer built on the attention mechanism.
+class AttentionSubsample(nn.Cell):
+    def __init__(self,
+                 in_dim: int,
+                 out_dim: int,
+                 key_dim: int,
+                 num_heads: int = 8,
+                 attn_ratio: int = 2,
+                 activation: type = None,
+                 stride: int = 2,
+                 resolution: int = 14,
+                 resolution_: int = 7) -> None:
+        super().__init__()
+
+        self.num_heads = num_heads
+        self.scale = key_dim ** -0.5
+        self.key_dim = key_dim
+        self.nh_kd = nh_kd = key_dim * num_heads
+        self.d = int(attn_ratio * key_dim)
+        self.dh = int(attn_ratio * key_dim) * self.num_heads
+        self.attn_ratio = attn_ratio
+        self.resolution_ = resolution_
+        self.resolution_2 = resolution_ ** 2
+        h = self.dh + nh_kd
+        self.kv = Linear_BN(in_dim, h, resolution=resolution)
+
+        self.q = nn.SequentialCell(
+            Subsample(stride, resolution),
+            Linear_BN(in_dim, nh_kd, resolution=resolution_))
+        self.proj = nn.SequentialCell(activation(), Linear_BN(self.dh, out_dim, resolution=resolution_))
+        self.stride = stride
+        self.resolution = resolution
+        points = list(itertools.product(range(resolution), range(resolution)))
+        points_ = list(itertools.product(range(resolution_), range(resolution_)))
+
+        N = len(points)
+        N_ = len(points_)
+        attention_offsets = {}
+        idxs = []
+        for p1 in points_:
+            for p2 in points:
+                size = 1
+                offset = (
+                    abs(p1[0] * stride - p2[0] + (size - 1) / 2),
+                    abs(p1[1] * stride - p2[1] + (size - 1) / 2))
+                if offset not in attention_offsets:
+                    attention_offsets[offset] = len(attention_offsets)
+                idxs.append(attention_offsets[offset])
+
+        self.attention_biases = Parameter(
+            Tensor(np.zeros([num_heads, len(attention_offsets)], np.float32)))
+
+        attention_bias_idxs = (ms.Tensor(idxs, dtype=ms.int64)).view((N_, N))
+        self.attention_bias_idxs = ms.Parameter(attention_bias_idxs, requires_grad=False)
+
+        self.ab = self.attention_biases[:, self.attention_bias_idxs]
+
+        self.softmax = nn.Softmax(axis=-1)
+
+        # FLOPs accounting from the reference implementation, kept for documentation:
+        # global FLOPS_COUNTER
+        # # queries * keys
+        # FLOPS_COUNTER += num_heads * (resolution ** 2) * (resolution_ ** 2) * key_dim
+        # # softmax
+        # FLOPS_COUNTER += num_heads * (resolution ** 2) * (resolution_ ** 2)
+        # # attention * v
+        # FLOPS_COUNTER += num_heads * (resolution ** 2) * (resolution_ ** 2) * self.d
+
+    def construct(self,
+                  x: Tensor) -> Tensor:
+
+        B, N, C = x.shape
+        atte = self.kv(x).view(B, N, self.num_heads, -1)
+
+        # Split the fused projection into k (key_dim) and v (d) channels.
+        kv = ms.numpy.split(atte, [self.key_dim], axis=3)
+        k = kv[0]
+        v = kv[1]
+        v = ops.transpose(v, (0, 2, 1, 3))
+        k = ops.transpose(k, (0, 2, 1, 3))
+
+        q = self.q(x).view(B, self.resolution_2, self.num_heads, self.key_dim)
+        q = ops.transpose(q, (0, 2, 1, 3))
+
+        attn = (
+            ops.matmul(q, ops.transpose(k, (0, 1, 3, 2))) * self.scale
+            +
+            (self.attention_biases[:, self.attention_bias_idxs]
+             if self.training else self.ab)
+        )
+
+        attn = self.softmax(attn)
+
+        x = ops.transpose(ops.matmul(attn, v), (0, 2, 1, 3))
+        x = x.reshape(B, -1, self.dh)
+        x = self.proj(x)
+        return x
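+
+
+# Shape walk-through for AttentionSubsample (illustrative, assuming the first
+# LeViT_128S downsample: in_dim=128, key_dim=16, num_heads=8, attn_ratio=4,
+# stride=2, resolution=14):
+#   input x:  (B, 196, 128)          # 14*14 tokens
+#   kv(x):    (B, 196, 8, 16 + 64)   # per head: k=key_dim, v=attn_ratio*key_dim
+#   q(x):     (B, 49, 8, 16)         # queries come from the subsampled 7*7 grid
+#   attn:     (B, 8, 49, 196)        # each output token attends to all input tokens
+#   output:   (B, 49, 256)           # projected to out_dim by self.proj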
+
+
+class LeViT(nn.Cell):
+    """Vision Transformer with support for patch or hybrid CNN input stage."""
+
+    def __init__(self,
+                 img_size: int = 224,
+                 patch_size: int = 16,
+                 in_channels: int = 3,
+                 num_classes: int = 1000,
+                 embed_dim: list = [128, 256, 384],
+                 key_dim: list = [16, 16, 16],
+                 depth: list = [2, 3, 4],
+                 num_heads: list = [4, 6, 8],
+                 attn_ratio: list = [2, 2, 2],
+                 mlp_ratio: list = [2, 2, 2],
+                 mlp_n: int = 128,
+                 down_ops: list = [['Subsample', 16, 128 // 16, 4, 2, 2], ['Subsample', 16, 256 // 16, 4, 2, 2]],
+                 attention_activation: type = nn.HSwish,
+                 mlp_activation: type = nn.HSwish,
+                 distillation: bool = False,
+                 drop_path: float = 0):
+        super().__init__()
+
+        self.num_classes = num_classes
+        self.num_features = embed_dim[-1]
+        self.embed_dim = embed_dim
+        self.distillation = distillation
+        self.patch_embed = MLP(mlp_n)
+        self.blocks = []
+
+        # Each down_ops entry reads:
+        # ['Subsample', key_dim, num_heads, attn_ratio, mlp_ratio, stride].
+        down_ops.append([''])
+        resolution = img_size // patch_size
+        for i, (ed, kd, dpth, nh, ar, mr, do) in enumerate(
+                zip(embed_dim, key_dim, depth, num_heads, attn_ratio, mlp_ratio, down_ops)):
+            for _ in range(dpth):
+                self.blocks.append(
+                    Residual(Attention(
+                        ed, kd, nh,
+                        attn_ratio=ar,
+                        activation=attention_activation,
+                        resolution=resolution,
+                    ), drop_path))
+                if mr > 0:
+                    h = int(ed * mr)
+                    self.blocks.append(
+                        Residual(nn.SequentialCell(
+                            Linear_BN(ed, h, resolution=resolution),
+                            mlp_activation(),
+                            Linear_BN(h, ed,  # bn_weight_init=0,
+                                      resolution=resolution),
+                        ), drop_path))
+
+            if do[0] == 'Subsample':
+                resolution_ = (resolution - 1) // do[5] + 1
+                self.blocks.append(
+                    AttentionSubsample(
+                        *embed_dim[i:i + 2], key_dim=do[1], num_heads=do[2],
+                        attn_ratio=do[3],
+                        activation=attention_activation,
+                        stride=do[5],
+                        resolution=resolution,
+                        resolution_=resolution_))
+                resolution = resolution_
+                if do[4] > 0:  # mlp_ratio
+                    h = int(embed_dim[i + 1] * do[4])
+                    self.blocks.append(
+                        Residual(nn.SequentialCell(
+                            Linear_BN(embed_dim[i + 1], h,
+                                      resolution=resolution),
+                            mlp_activation(),
+                            Linear_BN(
+                                h, embed_dim[i + 1],  # bn_weight_init=0,
+                                resolution=resolution),
+                        ), drop_path))
+        self.blocks = nn.SequentialCell(*self.blocks)
+
+        # Classifier head (plus a distillation head when requested).
+        self.head = BN_Linear(
+            embed_dim[-1], num_classes) if num_classes > 0 else nn.Identity()
+        if distillation:
+            self.head_dist = BN_Linear(
+                embed_dim[-1], num_classes) if num_classes > 0 else nn.Identity()
+
+    # Weight initialization from the reference implementation, currently disabled:
+    # def _initialize_weights(self) -> None:
+    #     for _, cell in self.cells_and_names():
+    #         if isinstance(cell, nn.Dense):
+    #             cell.weight.set_data(init.initializer(init.TruncatedNormal(sigma=.02), cell.weight.data.shape))
+    #             if cell.bias is not None:
+    #                 cell.bias.set_data(init.initializer(init.Constant(0), cell.bias.shape))
+    #         elif isinstance(cell, nn.LayerNorm):
+    #             cell.gamma.set_data(init.initializer(init.Constant(1.0), cell.gamma.shape))
+    #             cell.beta.set_data(init.initializer(init.Constant(0), cell.beta.shape))
+
+    def construct(self, x: Tensor) -> Tensor:
+
+        x = self.patch_embed(x)
+        B, C, H, W = x.shape
+        x = x.reshape(B, C, H * W)
+
+        # (B, C, N) -> (B, N, C): tokens along axis 1 for the attention blocks.
+        x = ops.transpose(x, (0, 2, 1))
+
+        x = self.blocks(x)
+
+        # Global average pooling over tokens.
+        x = x.mean(1)
+        if self.distillation:
+            x = self.head(x), self.head_dist(x)
+            if not self.training:
+                # At eval time, average the classifier and distillation logits.
+                x = (x[0] + x[1]) / 2
+        else:
+            x = self.head(x)
+        return x
+
+
+@register_model
+def LeViT_128S(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> LeViT:
+    default_cfg = default_cfgs['LeViT_128S']
+    model = LeViT(in_channels=in_channels, num_classes=num_classes,
+                  embed_dim=[128, 256, 384],
+                  num_heads=[4, 6, 8],
+                  key_dim=[16, 16, 16],
+                  depth=[2, 3, 4],
+                  down_ops=[
+                      ['Subsample', 16, 128 // 16, 4, 2, 2],
+                      ['Subsample', 16, 256 // 16, 4, 2, 2],
+                  ],
+                  mlp_n=128,
+                  distillation=True,
+                  **kwargs)
+    model.default_cfg = default_cfg
+
+    if pretrained:
+        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
+
+    return model
+
+
+@register_model
+def LeViT_128(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> LeViT:
+    default_cfg = default_cfgs['LeViT_128']
+    model = LeViT(in_channels=in_channels, num_classes=num_classes,
+                  embed_dim=[128, 256, 384],
+                  num_heads=[4, 8, 12],
+                  key_dim=[16, 16, 16],
+                  depth=[4, 4, 4],
+                  down_ops=[
+                      ['Subsample', 16, 128 // 16, 4, 2, 2],
+                      ['Subsample', 16, 256 // 16, 4, 2, 2],
+                  ],
+                  mlp_n=128,
+                  distillation=False,
+                  **kwargs)
+    model.default_cfg = default_cfg
+
+    if pretrained:
+        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
+
+    return model
+
+
+@register_model
+def LeViT_192(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> LeViT:
+    default_cfg = default_cfgs['LeViT_192']
+    model = LeViT(in_channels=in_channels, num_classes=num_classes,
+                  embed_dim=[192, 288, 384],
+                  num_heads=[3, 5, 6],
+                  key_dim=[32, 32, 32],
+                  depth=[4, 4, 4],
+                  down_ops=[
+                      ['Subsample', 32, 192 // 32, 4, 2, 2],
+                      ['Subsample', 32, 288 // 32, 4, 2, 2],
+                  ],
+                  mlp_n=192,
+                  distillation=False,
+                  **kwargs)
+    model.default_cfg = default_cfg
+
+    if pretrained:
+        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
+
+    return model
+
+
+@register_model
+def LeViT_256(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> LeViT:
+    default_cfg = default_cfgs['LeViT_256']
+    model = LeViT(in_channels=in_channels, num_classes=num_classes,
+                  embed_dim=[256, 384, 512],
+                  num_heads=[4, 6, 8],
+                  key_dim=[32, 32, 32],
+                  depth=[4, 4, 4],
+                  down_ops=[
+                      ['Subsample', 32, 256 // 32, 4, 2, 2],
+                      ['Subsample', 32, 384 // 32, 4, 2, 2],
+                  ],
+                  mlp_n=256,
+                  distillation=False,
+                  **kwargs)
+    model.default_cfg = default_cfg
+
+    if pretrained:
+        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
+
+    return model
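+
+
+# Minimal usage sketch for the dual-head behavior above (illustrative only;
+# assumes a CPU/PYNATIVE context has been configured, mirroring the smoke
+# test kept at the bottom of this file):
+#
+#     net = LeViT_128S()                   # distillation=True -> two heads
+#     net.set_train(True)
+#     logits, logits_dist = net(images)    # training: a (head, head_dist) tuple
+#     net.set_train(False)
+#     logits = net(images)                 # eval: averaged logits, (B, num_classes)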
+
+
+@register_model
+def LeViT_384(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> LeViT:
+    default_cfg = default_cfgs['LeViT_384']
+    model = LeViT(in_channels=in_channels, num_classes=num_classes,
+                  embed_dim=[384, 512, 768],
+                  num_heads=[6, 9, 12],
+                  key_dim=[32, 32, 32],
+                  depth=[4, 4, 4],
+                  down_ops=[
+                      ['Subsample', 32, 384 // 32, 4, 2, 2],
+                      ['Subsample', 32, 512 // 32, 4, 2, 2],
+                  ],
+                  mlp_n=384,
+                  distillation=False,
+                  **kwargs)
+    model.default_cfg = default_cfg
+
+    if pretrained:
+        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
+
+    return model
+
+
+# Standalone smoke test, kept for reference:
+# if __name__ == '__main__':
+#     import numpy as np
+#     import mindspore
+#     from mindspore import Tensor
+#     from mindspore import context
+#
+#     context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU")
+#
+#     model = LeViT_128S()
+#     print(model)
+#     dummy_input = Tensor(np.random.rand(4, 3, 224, 224), dtype=mindspore.float32)
+#     y = model(dummy_input)
+#     print(y.shape)
diff --git a/mindcv/utils/__init__.py b/mindcv/utils/__init__.py
index 39b346e0481a2d82ede9049cde85edae19de6848..d2462bf3cf8e41fbb0be9c2db922d134e3c4ec1f 100644
--- a/mindcv/utils/__init__.py
+++ b/mindcv/utils/__init__.py
@@ -8,4 +8,4 @@ from .path import *
 from .random import *
 from .reduce_manager import *
 from .train_step import *
-from .trainer_factory import *
+from .trainer_factory_distillation import *
diff --git a/mindcv/utils/get_train_one_step.py b/mindcv/utils/get_train_one_step.py
new file mode 100644
index 0000000000000000000000000000000000000000..795889876dcb523203e1c7a60d9a448cc4182154
--- /dev/null
+++ b/mindcv/utils/get_train_one_step.py
@@ -0,0 +1,223 @@
+import mindspore as ms
+from mindspore import nn
+from mindspore import Tensor
+from mindspore import dtype as mstype
+from mindspore.common import RowTensor
+from mindspore.ops import composite as C
+from mindspore.ops import functional as F
+from mindspore.ops import operations as P
+
+"""TrainOneStepWithEMA"""
+
+_ema_op = C.MultitypeFuncGraph("grad_ema_op")
+assign = P.Assign()
+assign_add = P.AssignAdd()
+
+
+@_ema_op.register("Tensor", "Tensor", "Tensor")
+def _ema_weights(factor, ema_weight, weight):
+    """Update one shadow weight in place: ema = ema * factor + weight * (1 - factor)."""
+    return assign(ema_weight, ema_weight * factor + weight * (1 - factor))
+
+
+class EMACell(nn.Cell):
+    """Maintains an exponential moving average of the given weights."""
+
+    def __init__(self, weights, ema_decay=0.9999):
+        super(EMACell, self).__init__()
+        self.ema_weights = weights.clone(prefix="_ema_weights")
+        self.ema_decay = Tensor(ema_decay, mstype.float32)
+        self.hyper_map = C.HyperMap()
+
+    def construct(self, *inputs, **kwargs):
+        weights = inputs[0]
+        success = self.hyper_map(
+            F.partial(_ema_op, self.ema_decay), self.ema_weights, weights
+        )
+        return success
+
+
+_grad_scale = C.MultitypeFuncGraph("grad_scale")
+reciprocal = P.Reciprocal()
+
+
+@_grad_scale.register("Tensor", "Tensor")
+def tensor_grad_scale(scale, grad):
+    return grad * F.cast(reciprocal(scale), F.dtype(grad))
+
+
+@_grad_scale.register("Tensor", "RowTensor")
+def tensor_grad_scale_row_tensor(scale, grad):
+    return RowTensor(grad.indices,
+                     grad.values * F.cast(reciprocal(scale), F.dtype(grad.values)),
+                     grad.dense_shape)
+
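+
+# Numeric sanity check for the EMA rule above (illustrative, not executed):
+# with ema_decay=0.9999, ema=1.0 and weight=0.0, one update gives
+# ema = 1.0 * 0.9999 + 0.0 * 0.0001 = 0.9999, i.e. the shadow weights drift
+# toward the live weights by 0.01% per step.
+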
+_grad_overflow = C.MultitypeFuncGraph("_grad_overflow")
+grad_overflow = P.FloatStatus()
+
+
+class TrainOneStepWithEMA(nn.TrainOneStepWithLossScaleCell):
+    """Train-one-step cell with loss scaling and an optional EMA of the weights."""
+
+    def __init__(self, network, optimizer, scale_sense=1.0, with_ema=False, ema_decay=0.9999):
+        super(TrainOneStepWithEMA, self).__init__(network, optimizer, scale_sense)
+        self.print = P.Print()
+        self.with_ema = with_ema
+        if self.with_ema:
+            self.ema_model = EMACell(self.weights, ema_decay=ema_decay)
+
+    def construct(self, *inputs):
+        """construct"""
+        weights = self.weights
+        loss = self.network(*inputs)
+        scaling_sens = self.scale_sense
+
+        status, scaling_sens = self.start_overflow_check(loss, scaling_sens)
+
+        scaling_sens_filled = C.ones_like(loss) * F.cast(scaling_sens, F.dtype(loss))
+        grads = self.grad(self.network, weights)(*inputs, scaling_sens_filled)
+        grads = self.hyper_map(F.partial(_grad_scale, scaling_sens), grads)
+        # apply grad reducer on grads
+        grads = self.grad_reducer(grads)
+        # get the overflow buffer
+        cond = self.get_overflow_status(status, grads)
+        overflow = self.process_loss_scale(cond)
+        # if there is no overflow, do optimize
+        if not overflow:
+            loss = F.depend(loss, self.optimizer(grads))
+            if self.with_ema:
+                self.ema_model(self.weights)
+        else:
+            self.print("=============Over Flow, skipping=============")
+        return loss
+
+
+"""TrainOneStepWithLossScaleCellGlobalNormClip"""
+# (_grad_scale and its registrations above are shared by both train cells.)
+
+
+class TrainOneStepWithLossScaleCellGlobalNormClip(
+    nn.TrainOneStepWithLossScaleCell
+):
+    """
+    Train-one-step cell with loss scaling and global-norm gradient clipping.
+
+    Appends an optimizer to the training network so that the construct
+    function builds the backward graph.
+
+    Args:
+        network (Cell): The training network. The loss function must already be wrapped in.
+        optimizer (Optimizer): Optimizer for updating the weights.
+        scale_sense (Number or Cell): The loss-scaling value or update cell. Default: 1.0.
+        use_global_norm (bool): Whether to apply global-norm clipping before the optimizer. Default: True.
+        clip_global_norm_value (float): Norm threshold for clipping. Default: 1.0.
+    """
+
+    def __init__(self,
+                 network,
+                 optimizer,
+                 scale_sense=1.0,
+                 use_global_norm=True,
+                 clip_global_norm_value=1.0):
+        super(TrainOneStepWithLossScaleCellGlobalNormClip, self).__init__(network, optimizer, scale_sense)
+        self.use_global_norm = use_global_norm  # construct() below reads this flag
+        self.clip_global_norm_value = clip_global_norm_value
+        self.print = P.Print()
+
+    def construct(self, *inputs):
+        """construct"""
+        weights = self.weights
+        loss = self.network(*inputs)
+        scaling_sens = self.scale_sense
+
+        status, scaling_sens = self.start_overflow_check(loss, scaling_sens)
+
+        scaling_sens_filled = C.ones_like(loss) * F.cast(scaling_sens, F.dtype(loss))
+        grads = self.grad(self.network, weights)(*inputs, scaling_sens_filled)
+        grads = self.hyper_map(F.partial(_grad_scale, scaling_sens), grads)
+        # apply grad reducer on grads
+        grads = self.grad_reducer(grads)
+        # get the overflow buffer
+        cond = self.get_overflow_status(status, grads)
+        overflow = self.process_loss_scale(cond)
+        # if there is no overflow, do optimize
+        if not overflow:
+            if self.use_global_norm:
+                grads = C.clip_by_global_norm(grads, clip_norm=self.clip_global_norm_value)
+            loss = F.depend(loss, self.optimizer(grads))
+        else:
+            self.print("=============Over Flow, skipping=============")
+        return loss
+
+
+def get_train_one_step(
+        network,
+        optimizer,
+        ema,
+        ema_decay,
+        clip_grad,
+        clip_value,
+        gradient_accumulation_steps,
+        scale_sense):
+    """Wrap `network` in the train-one-step cell matching the requested options.
+
+    `scale_sense` (a fixed Tensor or a loss-scale update cell) is built by the
+    caller; see `create_trainer` in trainer_factory_distillation.py.
+    `gradient_accumulation_steps` is accepted for interface parity with the
+    customized train step and is not used here.
+    """
+    if ema:
+        print(f"=> Using EMA. ema_decay: {ema_decay}")
+        network = TrainOneStepWithEMA(
+            network=network,
+            optimizer=optimizer,
+            scale_sense=scale_sense,
+            with_ema=ema,
+            ema_decay=ema_decay)
+    elif clip_grad:
+        print(f"=> Using gradient clipping by norm, clip_value: {clip_value}")
+        network = TrainOneStepWithLossScaleCellGlobalNormClip(
+            network,
+            optimizer,
+            scale_sense,
+            clip_global_norm_value=clip_value
+        )
+    else:
+        print("=> Use simple loss scale.")
+        network = nn.TrainOneStepWithLossScaleCell(
+            network, optimizer, scale_sense=scale_sense
+        )
+    return network
diff --git a/mindcv/utils/trainer_factory.py b/mindcv/utils/trainer_factory.py
index db47a48e665d8f2061b7dbdffc67fe9dfef17373..cb92651829d5097f7d632bb756b364cc844ae321 100644
--- a/mindcv/utils/trainer_factory.py
+++ b/mindcv/utils/trainer_factory.py
@@ -34,10 +34,10 @@ def get_metrics(num_classes):
 
 
 def require_customized_train_step(
-    ema: bool = False,
-    clip_grad: bool = False,
-    gradient_accumulation_steps: int = 1,
-    amp_cast_list: Optional[str] = None,
+        ema: bool = False,
+        clip_grad: bool = False,
+        gradient_accumulation_steps: int = 1,
+        amp_cast_list: Optional[str] = None,
 ):
     if ema:
         return True
@@ -74,20 +74,20 @@ def add_loss_network(network, loss_fn, amp_level):
 
 
 def create_trainer(
-    network: nn.Cell,
-    loss: nn.Cell,
-    optimizer: nn.Cell,
-    metrics: Union[dict, set],
-    amp_level: str,
-    amp_cast_list: str,
-    loss_scale_type: str,
-    loss_scale: float = 1.0,
-    drop_overflow_update: bool = False,
-    ema: bool = False,
-    ema_decay: float = 0.9999,
-    clip_grad: bool = False,
-    clip_value: float = 15.0,
-    gradient_accumulation_steps: int = 1,
+        network: nn.Cell,
+        loss: nn.Cell,
+        optimizer: nn.Cell,
+        metrics: Union[dict, set],
+        amp_level: str,
+        amp_cast_list: str,
+        loss_scale_type: str,
+        loss_scale: float = 1.0,
+        drop_overflow_update: bool = False,
+        ema: bool = False,
+        ema_decay: float = 0.9999,
+        clip_grad: bool = False,
+        clip_value: float = 15.0,
+        gradient_accumulation_steps: int = 1,
 ):
     """Create Trainer.
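The trainer factory added next wraps the student network in a `NetWithLoss` cell that calls `criterion(data, predict, label)`. The actual criterion is built by `mindcv/loss/distillation/criterion.py`, which is not part of this diff; the following is only a minimal sketch, under that assumption, of a DeiT-style hard-distillation loss with the same call signature. `HardDistillationLoss`, its `teacher` argument, and the choice of `SoftmaxCrossEntropyWithLogits` are illustrative, not the shipped implementation.

from mindspore import nn


class HardDistillationLoss(nn.Cell):
    """Sketch of a criterion matching NetWithLoss's criterion(data, predict, label) call."""

    def __init__(self, teacher: nn.Cell, base_criterion: nn.Cell, alpha: float = 0.5):
        super().__init__()
        self.teacher = teacher.set_train(False)  # teacher stays frozen
        self.base_criterion = base_criterion     # e.g. CE with label smoothing
        self.dist_criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
        self.alpha = alpha

    def construct(self, data, outputs, label):
        # In training mode the student returns (head, head_dist) logits.
        logits, logits_dist = outputs
        base_loss = self.base_criterion(logits, label)
        # Hard distillation: the teacher's argmax prediction serves as the target.
        teacher_labels = self.teacher(data).argmax(axis=-1)
        dist_loss = self.dist_criterion(logits_dist, teacher_labels)
        return base_loss * (1 - self.alpha) + dist_loss * self.alpha

With `distillation_type='soft'` one would instead match temperature-scaled softmax distributions using `distillation_tau`; the shipped criterion presumably switches between the two based on the `--distillation_type` argument added in config.py.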
diff --git a/mindcv/utils/trainer_factory_distillation.py b/mindcv/utils/trainer_factory_distillation.py new file mode 100644 index 0000000000000000000000000000000000000000..57a03a15abfa0f1cabb0301156fa7a118afff351 --- /dev/null +++ b/mindcv/utils/trainer_factory_distillation.py @@ -0,0 +1,221 @@ +import logging +from typing import Optional, Union + +import mindspore as ms +from mindspore import Tensor, context +from mindspore import dtype as mstype +from mindspore import nn +from mindspore.ops import functional as F +from mindspore.train import DynamicLossScaleManager, FixedLossScaleManager, Model + +from .amp import auto_mixed_precision +from .train_step import TrainStep +from .get_train_one_step import get_train_one_step + +__all__ = [ + "get_metrics", + "require_customized_train_step", + "create_trainer", +] + +_logger = logging.getLogger(__name__) + + +def get_metrics(num_classes): + if num_classes >= 5: + metrics = { + "Top_1_Accuracy": nn.Top1CategoricalAccuracy(), + "Top_5_Accuracy": nn.Top5CategoricalAccuracy(), + } + else: + metrics = { + "Top_1_Accuracy": nn.Top1CategoricalAccuracy(), + } + return metrics + + +def require_customized_train_step( + ema: bool = False, + clip_grad: bool = False, + gradient_accumulation_steps: int = 1, + amp_cast_list: Optional[str] = None, +): + if ema: + return True + if clip_grad: + return True + if gradient_accumulation_steps > 1: + return True + if amp_cast_list: + return True + return False + + +def add_loss_network(network, loss_fn, amp_level): + """Add loss network.""" + + class WithLossCell(nn.Cell): + "Wrap loss for amp. Cast network output back to float32" + + def __init__(self, backbone, loss_fn): + super(WithLossCell, self).__init__(auto_prefix=False) + self._backbone = backbone + self._loss_fn = loss_fn + + def construct(self, data, label): + out = self._backbone(data) + label = F.mixed_precision_cast(mstype.float32, label) + return self._loss_fn(F.mixed_precision_cast(mstype.float32, out), label) + + if amp_level == "O2" or amp_level == "O3": + network = WithLossCell(network, loss_fn) + else: + network = nn.WithLossCell(network, loss_fn) + return network + + +class NetWithLoss(nn.Cell): + """ + NetWithLoss: Only support Network with Classification. + """ + + def __init__(self, model, criterion): + super(NetWithLoss, self).__init__() + self.model = model + self.criterion = criterion + + def construct(self, *inputs, **kwargs): + data = inputs[0] + label = inputs[1] + predict = self.model(data) + loss = self.criterion(data, predict, label) + return loss + + +def create_trainer( + network: nn.Cell, + loss: nn.Cell, + criterion: nn.Cell, + optimizer: nn.Cell, + metrics: Union[dict, set], + amp_level: str, + amp_cast_list: str, + loss_scale_type: str, + loss_scale: float = 1.0, + drop_overflow_update: bool = False, + ema: bool = False, + ema_decay: float = 0.9999, + clip_grad: bool = False, + clip_value: float = 15.0, + gradient_accumulation_steps: int = 1, +): + """Create Trainer. + + Args: + network: The backbone network to train, evaluate or predict. + loss: The function of eval_network loss. + criterion: The function of calculating loss, + optimizer: The optimizer for training. + metrics: The metrics for model evaluation. + amp_level: The level of auto mixing precision training. + amp_cast_list: At the cell level, custom casting the cell to FP16. + loss_scale_type: The type of loss scale. + loss_scale: The value of loss scale. + drop_overflow_update: Whether to execute optimizer if there is an overflow. 
+ ema: Whether to use exponential moving average of model weights. + ema_decay: Decay factor for model weights moving average. + clip_grad: whether to gradient clip. + clip_value: The value at which to clip gradients. + gradient_accumulation_steps: Accumulate the gradients of n batches before update. + + Returns: + mindspore.Model + + """ + if loss_scale < 1.0: + raise ValueError("Loss scale cannot be less than 1.0!") + + if drop_overflow_update is False and loss_scale_type.lower() == "dynamic": + raise ValueError("DynamicLossScale ALWAYS drop overflow!") + + if gradient_accumulation_steps < 1: + raise ValueError("`gradient_accumulation_steps` must be >= 1!") + + if not require_customized_train_step(ema, clip_grad, gradient_accumulation_steps, amp_cast_list): + net_with_loss = NetWithLoss(network, criterion) + eval_network = nn.WithEvalCell(network, loss, amp_level in ["O2", "O3", "auto"]) + eval_indexes = [0, 1, 2] + + mindspore_kwargs = dict( + network=net_with_loss, + # loss_fn=loss, + optimizer=optimizer, + metrics=metrics, + amp_level=amp_level, + eval_network=eval_network, + eval_indexes=eval_indexes, + ) + if loss_scale_type.lower() == "fixed": + mindspore_kwargs["loss_scale_manager"] = FixedLossScaleManager( + loss_scale=loss_scale, drop_overflow_update=drop_overflow_update + ) + elif loss_scale_type.lower() == "dynamic": + mindspore_kwargs["loss_scale_manager"] = DynamicLossScaleManager( + init_loss_scale=loss_scale, scale_factor=2, scale_window=2000 + ) + elif loss_scale_type.lower() == "auto": + # We don't explicitly construct LossScaleManager + _logger.warning( + "You are using AUTO loss scale, which means the LossScaleManager isn't explicitly pass in " + "when creating a mindspore.Model instance. " + "NOTE: mindspore.Model may use LossScaleManager silently. See mindspore.train.amp for details." + ) + else: + raise ValueError(f"Loss scale type only support ['fixed', 'dynamic', 'auto'], but got{loss_scale_type}.") + model = Model(**mindspore_kwargs) + else: # require customized train step + eval_network = nn.WithEvalCell(network, loss, amp_level in ["O2", "O3", "auto"]) # loss=ce + auto_mixed_precision(network, amp_level, amp_cast_list) + # net_with_loss = add_loss_network(network, loss, amp_level) + # criterion = get_criterion_by_args(args) + net_with_loss = NetWithLoss(network, criterion) # add_loss_network + + train_step_kwargs = dict( + network=net_with_loss, + optimizer=optimizer, + ema=ema, + ema_decay=ema_decay, + clip_grad=clip_grad, + clip_value=clip_value, + gradient_accumulation_steps=gradient_accumulation_steps, + ) + if loss_scale_type.lower() == "fixed": + loss_scale_manager = FixedLossScaleManager(loss_scale=loss_scale, + drop_overflow_update=drop_overflow_update) # scale_sense + elif loss_scale_type.lower() == "dynamic": + loss_scale_manager = DynamicLossScaleManager(init_loss_scale=loss_scale, scale_factor=2, scale_window=2000) + else: + raise ValueError(f"Loss scale type only support ['fixed', 'dynamic'], but got{loss_scale_type}.") + update_cell = loss_scale_manager.get_update_cell() + # 1. loss_scale_type="fixed", drop_overflow_update=False + # --> update_cell=None, TrainStep=TrainOneStepCell(scale_sense=loss_scale) + # 2. loss_scale_type: fixed, drop_overflow_update: True + # --> update_cell=FixedLossScaleUpdateCell, TrainStep=TrainOneStepWithLossScaleCell(scale_sense=update_cell) + # 3. 
loss_scale_type: dynamic, drop_overflow_update: True + # --> update_cell=DynamicLossScaleUpdateCell, TrainStep=TrainOneStepWithLossScaleCell(scale_sense=update_cell) + if update_cell is None: + train_step_kwargs["scale_sense"] = Tensor(loss_scale, dtype=ms.float32) + else: + if not context.get_context("enable_ge") and context.get_context("device_target") == "CPU": + raise ValueError( + "Only `loss_scale_type` is `fixed` and `drop_overflow_update` is `False`" + "are supported on device `CPU`." + ) + train_step_kwargs["scale_sense"] = update_cell + # train_step_cell = TrainStep(**train_step_kwargs).set_train() + + train_step_cell = get_train_one_step(**train_step_kwargs).set_train() + model = Model(train_step_cell, eval_network=eval_network, metrics=metrics, eval_indexes=[0, 1, 2]) + # + # todo: do we need to set model._loss_scale_manager + return model diff --git a/train_zhisuan.py b/train_zhisuan.py new file mode 100644 index 0000000000000000000000000000000000000000..8eee78f51cec3ba5d4ea8d5281735d3ddcfa2583 --- /dev/null +++ b/train_zhisuan.py @@ -0,0 +1,434 @@ +""" Model training pipeline """ +import logging +import os +import numpy as np +import moxing as mox +import time +import json + +import mindspore as ms +from mindspore import FixedLossScaleManager, Model, Tensor, nn, context +from mindspore.communication import get_group_size, get_rank, init + +from mindspore.context import ParallelMode + +from mindcv.data import create_dataset, create_loader, create_transforms +from mindcv.loss import create_loss +from mindcv.models import create_model +from mindcv.optim import create_optimizer +from mindcv.scheduler import create_scheduler +from mindcv.utils import ( + AllReduceSum, + StateMonitor, + # create_trainer, + get_metrics, + require_customized_train_step, + set_logger, + set_seed, +) +from mindcv.utils.trainer_factory import create_trainer +from config import parse_args, save_args # isort: skip + +logger = logging.getLogger("mindcv.train") + + +def train(args): + """main train function""" + + ms.set_context(mode=args.mode) + if args.distribute: + init() + device_num = get_group_size() + rank_id = get_rank() + ms.set_auto_parallel_context( + device_num=device_num, + parallel_mode="data_parallel", + gradients_mean=True, + # we should but cannot set parameter_broadcast=True, which will cause error on gpu. 
+ ) + else: + device_num = None + rank_id = None + + set_seed(args.seed) + set_logger(name="mindcv", output_dir=args.ckpt_save_dir, rank=rank_id, color=False) + logger.info( + "We recommend installing `termcolor` via `pip install termcolor` " + "and setup logger by `set_logger(..., color=True)`" + ) + + # create dataset + dataset_train = create_dataset( + name=args.dataset, + root=args.data_dir, + split=args.train_split, + shuffle=args.shuffle, + num_samples=args.num_samples, + num_shards=device_num, + shard_id=rank_id, + num_parallel_workers=args.num_parallel_workers, + download=args.dataset_download, + num_aug_repeats=args.aug_repeats, + ) + + if args.num_classes is None: + num_classes = dataset_train.num_classes() + else: + num_classes = args.num_classes + + # create transforms + num_aug_splits = 0 + if args.aug_splits > 0: + assert args.aug_splits == 3, "Currently, only support 3 splits of augmentation" + assert args.auto_augment is not None, "aug_splits should be set with one auto_augment" + num_aug_splits = args.aug_splits + + transform_list = create_transforms( + dataset_name=args.dataset, + is_training=True, + image_resize=args.image_resize, + scale=args.scale, + ratio=args.ratio, + hflip=args.hflip, + vflip=args.vflip, + color_jitter=args.color_jitter, + interpolation=args.interpolation, + auto_augment=args.auto_augment, + mean=args.mean, + std=args.std, + re_prob=args.re_prob, + re_scale=args.re_scale, + re_ratio=args.re_ratio, + re_value=args.re_value, + re_max_attempts=args.re_max_attempts, + separate=num_aug_splits > 0, + ) + + # load dataset + loader_train = create_loader( + dataset=dataset_train, + batch_size=args.batch_size, + drop_remainder=args.drop_remainder, + is_training=True, + mixup=args.mixup, + cutmix=args.cutmix, + cutmix_prob=args.cutmix_prob, + num_classes=num_classes, + transform=transform_list, + num_parallel_workers=args.num_parallel_workers, + separate=num_aug_splits > 0, + ) + + if args.val_while_train: + dataset_eval = create_dataset( + name=args.dataset, + root=args.data_dir, + split=args.val_split, + num_shards=device_num, + shard_id=rank_id, + num_parallel_workers=args.num_parallel_workers, + download=args.dataset_download, + ) + + transform_list_eval = create_transforms( + dataset_name=args.dataset, + is_training=False, + image_resize=args.image_resize, + crop_pct=args.crop_pct, + interpolation=args.interpolation, + mean=args.mean, + std=args.std, + ) + + loader_eval = create_loader( + dataset=dataset_eval, + batch_size=args.batch_size, + drop_remainder=False, + is_training=False, + transform=transform_list_eval, + num_parallel_workers=args.num_parallel_workers, + ) + # validation dataset count + eval_count = dataset_eval.get_dataset_size() + if args.distribute: + all_reduce = AllReduceSum() + eval_count = all_reduce(Tensor(eval_count, ms.int32)) + else: + loader_eval = None + eval_count = None + + num_batches = loader_train.get_dataset_size() + # Train dataset count + train_count = dataset_train.get_dataset_size() + if args.distribute: + all_reduce = AllReduceSum() + train_count = all_reduce(Tensor(train_count, ms.int32)) + + # create model + network = create_model( + model_name=args.model, + num_classes=num_classes, + in_channels=args.in_channels, + drop_rate=args.drop_rate, + drop_path_rate=args.drop_path_rate, + pretrained=args.pretrained, + checkpoint_path=args.ckpt_path, + ema=args.ema, + ) + + num_params = sum([param.size for param in network.get_parameters()]) + + # create loss + loss = create_loss( + name=args.loss, + reduction=args.reduction, 
+ label_smoothing=args.label_smoothing, + aux_factor=args.aux_factor, + ) + + # create learning rate schedule + lr_scheduler = create_scheduler( + num_batches, + scheduler=args.scheduler, + lr=args.lr, + min_lr=args.min_lr, + warmup_epochs=args.warmup_epochs, + warmup_factor=args.warmup_factor, + decay_epochs=args.decay_epochs, + decay_rate=args.decay_rate, + milestones=args.multi_step_decay_milestones, + num_epochs=args.epoch_size, + num_cycles=args.num_cycles, + cycle_decay=args.cycle_decay, + lr_epoch_stair=args.lr_epoch_stair, + ) + + # resume training if ckpt_path is given + if args.ckpt_path != "" and args.resume_opt: + opt_ckpt_path = os.path.join(args.ckpt_save_dir, f"optim_{args.model}.ckpt") + else: + opt_ckpt_path = "" + + # create optimizer + # TODO: consistent naming opt, name, dataset_name + if ( + args.loss_scale_type == "fixed" + and args.drop_overflow_update is False + and not require_customized_train_step( + args.ema, + args.clip_grad, + args.gradient_accumulation_steps, + args.amp_cast_list, + ) + ): + optimizer_loss_scale = args.loss_scale + else: + optimizer_loss_scale = 1.0 + optimizer = create_optimizer( + network.trainable_params(), + opt=args.opt, + lr=lr_scheduler, + weight_decay=args.weight_decay, + momentum=args.momentum, + nesterov=args.use_nesterov, + filter_bias_and_bn=args.filter_bias_and_bn, + loss_scale=optimizer_loss_scale, + checkpoint_path=opt_ckpt_path, + eps=args.eps, + ) + + # Define eval metrics. + metrics = get_metrics(num_classes) + + # create trainer + trainer = create_trainer( + network, + loss, + optimizer, + metrics, + amp_level=args.amp_level, + amp_cast_list=args.amp_cast_list, + loss_scale_type=args.loss_scale_type, + loss_scale=args.loss_scale, + drop_overflow_update=args.drop_overflow_update, + ema=args.ema, + ema_decay=args.ema_decay, + clip_grad=args.clip_grad, + clip_value=args.clip_value, + gradient_accumulation_steps=args.gradient_accumulation_steps, + ) + + # callback + # save checkpoint, summary training loss + # record val acc and do model selection if val dataset is available + begin_step = 0 + begin_epoch = 0 + if args.ckpt_path != "": + begin_step = optimizer.global_step.asnumpy()[0] + begin_epoch = args.ckpt_path.split("/")[-1].split("-")[1].split("_")[0] + begin_epoch = int(begin_epoch) + + summary_dir = f"./{args.ckpt_save_dir}/summary" + assert ( + args.ckpt_save_policy != "top_k" or args.val_while_train is True + ), "ckpt_save_policy is top_k, val_while_train must be True." 
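+    # StateMonitor drives per-epoch validation, checkpointing and summary
+    # logging; it is the single callback used below.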
+ state_cb = StateMonitor( + trainer, + model_name=args.model, + model_ema=args.ema, + last_epoch=begin_epoch, + dataset_sink_mode=args.dataset_sink_mode, + dataset_val=loader_eval, + metric_name=list(metrics.keys()), + val_interval=args.val_interval, + ckpt_save_dir=args.ckpt_save_dir, + ckpt_save_interval=args.ckpt_save_interval, + ckpt_save_policy=args.ckpt_save_policy, + ckpt_keep_max=args.keep_checkpoint_max, + summary_dir=summary_dir, + log_interval=args.log_interval, + rank_id=rank_id, + device_num=device_num, + ) + + callbacks = [state_cb] + essential_cfg_msg = "\n".join( + [ + "Essential Experiment Configurations:", + f"MindSpore mode[GRAPH(0)/PYNATIVE(1)]: {args.mode}", + f"Distributed mode: {args.distribute}", + f"Number of devices: {device_num if device_num is not None else 1}", + f"Number of training samples: {train_count}", + f"Number of validation samples: {eval_count}", + f"Number of classes: {num_classes}", + f"Number of batches: {num_batches}", + f"Batch size: {args.batch_size}", + f"Auto augment: {args.auto_augment}", + f"MixUp: {args.mixup}", + f"CutMix: {args.cutmix}", + f"Model: {args.model}", + f"Model parameters: {num_params}", + f"Number of epochs: {args.epoch_size}", + f"Optimizer: {args.opt}", + f"Learning rate: {args.lr}", + f"LR Scheduler: {args.scheduler}", + f"Momentum: {args.momentum}", + f"Weight decay: {args.weight_decay}", + f"Auto mixed precision: {args.amp_level}", + f"Loss scale: {args.loss_scale}({args.loss_scale_type})", + ] + ) + logger.info(essential_cfg_msg) + save_args(args, os.path.join(args.ckpt_save_dir, f"{args.model}.yaml"), rank_id) + + if args.ckpt_path != "": + logger.info(f"Resume training from {args.ckpt_path}, last step: {begin_step}, last epoch: {begin_epoch}") + else: + logger.info("Start training") + + trainer.train(args.epoch_size, loader_train, callbacks=callbacks, dataset_sink_mode=args.dataset_sink_mode) + +def C2netMultiObsToEnv(multi_data_url, data_dir): + #--multi_data_url is json data, need to do json parsing for multi_data_url + print(multi_data_url) + multi_data_json = json.loads(multi_data_url) + for i in range(len(multi_data_json)): + zipfile_path = data_dir + "/" + multi_data_json[i]["dataset_name"] + try: + mox.file.copy(multi_data_json[i]["dataset_url"], zipfile_path) + print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],zipfile_path)) + #get filename and unzip the dataset + filename = os.path.splitext(multi_data_json[i]["dataset_name"])[0] + filePath = data_dir + "/" + filename + if not os.path.exists(filePath): + os.makedirs(filePath) + os.system("unzip {} -d {}".format(zipfile_path, filePath)) + + except Exception as e: + print('moxing download {} to {} failed: '.format( + multi_data_json[i]["dataset_url"], zipfile_path) + str(e)) + #Set a cache file to determine whether the data has been copied to obs. + #If this file exists during multi-card training, there is no need to copy the dataset multiple times. 
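+    # The empty sentinel file below acts as a cross-process barrier: rank 0
+    # creates it once the copy/unzip loop has finished, and the other ranks in
+    # DownloadFromQizhi poll for it before touching the dataset.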
+ f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return + + +def DownloadFromQizhi(multi_data_url, data_dir): + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + C2netMultiObsToEnv(multi_data_url,data_dir) + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + if device_num > 1: + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) + init() + #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data + local_rank=int(os.getenv('RANK_ID')) + if local_rank%8==0: + C2netMultiObsToEnv(multi_data_url,data_dir) + #If the cache file does not exist, it means that the copy data has not been completed, + #and Wait for 0th card to finish copying data + while not os.path.exists("/cache/download_input.txt"): + time.sleep(1) + return + + +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir, + obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir, + obs_train_url) + str(e)) + return + + +def UploadToQizhi(train_dir, obs_train_url): + device_num = int(os.getenv('RANK_SIZE')) + local_rank=int(os.getenv('RANK_ID')) + if device_num == 1: + EnvToObs(train_dir, obs_train_url) + if device_num > 1: + if local_rank%8==0: + EnvToObs(train_dir, obs_train_url) + return + +if __name__ == "__main__": + args = parse_args() + + # modelarts + data_dir = '/cache/dataset' + train_dir = '/cache/output' + ckpt_url = '/cache/checkpoint.ckpt' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + print(args.multi_data_url) + DownloadFromQizhi(args.multi_data_url, data_dir) + + train(args) + + UploadToQizhi(train_dir,args.train_url) + + # # data sync for cloud platform if enabled + # if args.enable_modelarts: + # import moxing as mox + + # args.data_dir = f"/cache/{args.data_url}" + # mox.file.copy_parallel(src_url=os.path.join(args.data_url, args.dataset), dst_url=args.data_dir) + + # # core training + # train(args) + + # if args.enable_modelarts: + # mox.file.copy_parallel(src_url=args.ckpt_save_dir, dst_url=args.train_url) \ No newline at end of file diff --git a/train_zhisuan_distillation.py b/train_zhisuan_distillation.py new file mode 100644 index 0000000000000000000000000000000000000000..512e677fea64d7a176eecc83a077fd97cbe16524 --- /dev/null +++ b/train_zhisuan_distillation.py @@ -0,0 +1,455 @@ +""" Model training pipeline """ +import logging +import os +import moxing as mox +import time +import json + +import mindspore as ms +from mindspore import Tensor, context +from mindspore.communication import get_group_size, get_rank, init + +from mindspore.context import ParallelMode + +from mindcv.data import create_dataset, create_loader, create_transforms +from mindcv.loss import create_loss +from mindcv.models import create_model +from mindcv.optim import create_optimizer +from mindcv.scheduler import create_scheduler +from mindcv.utils import ( + AllReduceSum, + 
StateMonitor, + # create_trainer, + get_metrics, + require_customized_train_step, + set_logger, + set_seed, +) +from mindcv.utils.trainer_factory_distillation import create_trainer +from config import parse_args, save_args # isort: skip +# add distillation +from mindcv.loss.distillation.criterion import get_criterion_by_args + +logger = logging.getLogger("mindcv.train") + + +def train(args): + """main train function""" + + ms.set_context(mode=args.mode) + if args.distribute: + init() + device_num = get_group_size() + rank_id = get_rank() + ms.set_auto_parallel_context( + device_num=device_num, + parallel_mode="data_parallel", + gradients_mean=True, + # we should but cannot set parameter_broadcast=True, which will cause error on gpu. + ) + else: + device_num = None + rank_id = None + + set_seed(args.seed) + set_logger(name="mindcv", output_dir=args.ckpt_save_dir, rank=rank_id, color=False) + logger.info( + "We recommend installing `termcolor` via `pip install termcolor` " + "and setup logger by `set_logger(..., color=True)`" + ) + + # create dataset + dataset_train = create_dataset( + name=args.dataset, + root=args.data_dir, + split=args.train_split, + shuffle=args.shuffle, + num_samples=args.num_samples, + num_shards=device_num, + shard_id=rank_id, + num_parallel_workers=args.num_parallel_workers, + download=args.dataset_download, + num_aug_repeats=args.aug_repeats, + ) + + if args.num_classes is None: + num_classes = dataset_train.num_classes() + else: + num_classes = args.num_classes + + # create transforms + num_aug_splits = 0 + if args.aug_splits > 0: + assert args.aug_splits == 3, "Currently, only support 3 splits of augmentation" + assert args.auto_augment is not None, "aug_splits should be set with one auto_augment" + num_aug_splits = args.aug_splits + + transform_list = create_transforms( + dataset_name=args.dataset, + is_training=True, + image_resize=args.image_resize, + scale=args.scale, + ratio=args.ratio, + hflip=args.hflip, + vflip=args.vflip, + color_jitter=args.color_jitter, + interpolation=args.interpolation, + auto_augment=args.auto_augment, + mean=args.mean, + std=args.std, + re_prob=args.re_prob, + re_scale=args.re_scale, + re_ratio=args.re_ratio, + re_value=args.re_value, + re_max_attempts=args.re_max_attempts, + separate=num_aug_splits > 0, + ) + + # load dataset + loader_train = create_loader( + dataset=dataset_train, + batch_size=args.batch_size, + drop_remainder=args.drop_remainder, + is_training=True, + mixup=args.mixup, + cutmix=args.cutmix, + cutmix_prob=args.cutmix_prob, + num_classes=num_classes, + transform=transform_list, + num_parallel_workers=args.num_parallel_workers, + separate=num_aug_splits > 0, + ) + + if args.val_while_train: + dataset_eval = create_dataset( + name=args.dataset, + root=args.data_dir, + split=args.val_split, + num_shards=device_num, + shard_id=rank_id, + num_parallel_workers=args.num_parallel_workers, + download=args.dataset_download, + ) + + transform_list_eval = create_transforms( + dataset_name=args.dataset, + is_training=False, + image_resize=args.image_resize, + crop_pct=args.crop_pct, + interpolation=args.interpolation, + mean=args.mean, + std=args.std, + ) + + loader_eval = create_loader( + dataset=dataset_eval, + batch_size=args.batch_size, + drop_remainder=False, + is_training=False, + transform=transform_list_eval, + num_parallel_workers=args.num_parallel_workers, + ) + # validation dataset count + eval_count = dataset_eval.get_dataset_size() + if args.distribute: + all_reduce = AllReduceSum() + eval_count = 
all_reduce(Tensor(eval_count, ms.int32)) + else: + loader_eval = None + eval_count = None + + num_batches = loader_train.get_dataset_size() + # Train dataset count + train_count = dataset_train.get_dataset_size() + if args.distribute: + all_reduce = AllReduceSum() + train_count = all_reduce(Tensor(train_count, ms.int32)) + + # create model + network = create_model( + model_name=args.model, + num_classes=num_classes, + in_channels=args.in_channels, + drop_rate=args.drop_rate, + drop_path_rate=args.drop_path_rate, + pretrained=args.pretrained, + checkpoint_path=args.ckpt_path, + ema=args.ema, + ) + + num_params = sum([param.size for param in network.get_parameters()]) + + # create eval_network loss + loss = create_loss( + name=args.loss, + reduction=args.reduction, + label_smoothing=args.label_smoothing, + aux_factor=args.aux_factor, + ) + + # create learning rate schedule + lr_scheduler = create_scheduler( + num_batches, + scheduler=args.scheduler, + lr=args.lr, + min_lr=args.min_lr, + warmup_epochs=args.warmup_epochs, + warmup_factor=args.warmup_factor, + decay_epochs=args.decay_epochs, + decay_rate=args.decay_rate, + milestones=args.multi_step_decay_milestones, + num_epochs=args.epoch_size, + num_cycles=args.num_cycles, + cycle_decay=args.cycle_decay, + lr_epoch_stair=args.lr_epoch_stair, + ) + + # resume training if ckpt_path is given + if args.ckpt_path != "" and args.resume_opt: + opt_ckpt_path = os.path.join(args.ckpt_save_dir, f"optim_{args.model}.ckpt") + else: + opt_ckpt_path = "" + + # create optimizer + # TODO: consistent naming opt, name, dataset_name + if ( + args.loss_scale_type == "fixed" + and args.drop_overflow_update is False + and not require_customized_train_step( + args.ema, + args.clip_grad, + args.gradient_accumulation_steps, + args.amp_cast_list, + ) + ): + optimizer_loss_scale = args.loss_scale + else: + optimizer_loss_scale = 1.0 + optimizer = create_optimizer( + network.trainable_params(), + opt=args.opt, + lr=lr_scheduler, + weight_decay=args.weight_decay, + momentum=args.momentum, + nesterov=args.use_nesterov, + filter_bias_and_bn=args.filter_bias_and_bn, + loss_scale=optimizer_loss_scale, + checkpoint_path=opt_ckpt_path, + eps=args.eps, + ) + + # Define eval metrics. 
+ metrics = get_metrics(num_classes) + + # loss for levit distillation + criterion = create_loss( + name=args.name_for_distillation, + label_smoothing=args.label_smoothing, # 0.1 + num_classes=args.num_classes, # 1000 + mixup=args.mixup, # 0.8 + cutmix=args.cutmix, # 1.0 + bce_loss=args.bce_loss, # flase + distillation_type=args.distillation_type, # hard + teacher_path=args.teacher_path, # ckpt + teacher_model=args.teacher_model, # regnety_160 + distillation_alpha=args.distillation_alpha, # 0.5 + distillation_tau=args.distillation_tau, # 1.0 + ) + + # create trainer + trainer = create_trainer( + network, + loss, + criterion, + optimizer, + metrics, + amp_level=args.amp_level, # O2 + amp_cast_list=args.amp_cast_list, # none + loss_scale_type=args.loss_scale_type, # fixed + loss_scale=args.loss_scale, # 1024 + drop_overflow_update=args.drop_overflow_update, # False + ema=args.ema, # False + ema_decay=args.ema_decay, # 0.9999 + clip_grad=args.clip_grad, # False + clip_value=args.clip_value, # 15.0 + gradient_accumulation_steps=args.gradient_accumulation_steps, # 1 + ) + + # callback + # save checkpoint, summary training loss + # record val acc and do model selection if val dataset is available + begin_step = 0 + begin_epoch = 0 + if args.ckpt_path != "": + begin_step = optimizer.global_step.asnumpy()[0] + begin_epoch = args.ckpt_path.split("/")[-1].split("-")[1].split("_")[0] + begin_epoch = int(begin_epoch) + + summary_dir = f"./{args.ckpt_save_dir}/summary" + assert ( + args.ckpt_save_policy != "top_k" or args.val_while_train is True + ), "ckpt_save_policy is top_k, val_while_train must be True." + state_cb = StateMonitor( + trainer, + model_name=args.model, + model_ema=args.ema, + last_epoch=begin_epoch, + dataset_sink_mode=args.dataset_sink_mode, + dataset_val=loader_eval, + metric_name=list(metrics.keys()), + val_interval=args.val_interval, + ckpt_save_dir=args.ckpt_save_dir, + ckpt_save_interval=args.ckpt_save_interval, + ckpt_save_policy=args.ckpt_save_policy, + ckpt_keep_max=args.keep_checkpoint_max, + summary_dir=summary_dir, + log_interval=args.log_interval, + rank_id=rank_id, + device_num=device_num, + ) + + callbacks = [state_cb] + essential_cfg_msg = "\n".join( + [ + "Essential Experiment Configurations:", + f"MindSpore mode[GRAPH(0)/PYNATIVE(1)]: {args.mode}", + f"Distributed mode: {args.distribute}", + f"Number of devices: {device_num if device_num is not None else 1}", + f"Number of training samples: {train_count}", + f"Number of validation samples: {eval_count}", + f"Number of classes: {num_classes}", + f"Number of batches: {num_batches}", + f"Batch size: {args.batch_size}", + f"Auto augment: {args.auto_augment}", + f"MixUp: {args.mixup}", + f"CutMix: {args.cutmix}", + f"Model: {args.model}", + f"Model parameters: {num_params}", + f"Number of epochs: {args.epoch_size}", + f"Optimizer: {args.opt}", + f"Learning rate: {args.lr}", + f"LR Scheduler: {args.scheduler}", + f"Momentum: {args.momentum}", + f"Weight decay: {args.weight_decay}", + f"Auto mixed precision: {args.amp_level}", + f"Loss scale: {args.loss_scale}({args.loss_scale_type})", + ] + ) + logger.info(essential_cfg_msg) + save_args(args, os.path.join(args.ckpt_save_dir, f"{args.model}.yaml"), rank_id) + + if args.ckpt_path != "": + logger.info(f"Resume training from {args.ckpt_path}, last step: {begin_step}, last epoch: {begin_epoch}") + else: + logger.info("Start training") + + trainer.train(args.epoch_size, loader_train, callbacks=callbacks, dataset_sink_mode=args.dataset_sink_mode) + + +def 
C2netMultiObsToEnv(multi_data_url, data_dir): + # --multi_data_url is json data, need to do json parsing for multi_data_url + print(multi_data_url) + multi_data_json = json.loads(multi_data_url) + for i in range(len(multi_data_json)): + zipfile_path = data_dir + "/" + multi_data_json[i]["dataset_name"] + try: + mox.file.copy(multi_data_json[i]["dataset_url"], zipfile_path) + print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"], zipfile_path)) + # get filename and unzip the dataset + filename = os.path.splitext(multi_data_json[i]["dataset_name"])[0] + filePath = data_dir + "/" + filename + if not os.path.exists(filePath): + os.makedirs(filePath) + os.system("unzip {} -d {}".format(zipfile_path, filePath)) + + except Exception as e: + print('moxing download {} to {} failed: '.format( + multi_data_json[i]["dataset_url"], zipfile_path) + str(e)) + # Set a cache file to determine whether the data has been copied to obs. + # If this file exists during multi-card training, there is no need to copy the dataset multiple times. + f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return + + +def DownloadFromQizhi(multi_data_url, data_dir): + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + C2netMultiObsToEnv(multi_data_url, data_dir) + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) + if device_num > 1: + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, + device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + gradients_mean=True, parameter_broadcast=True) + init() + # Copying obs data does not need to be executed multiple times, just let the 0th card copy the data + local_rank = int(os.getenv('RANK_ID')) + if local_rank % 8 == 0: + C2netMultiObsToEnv(multi_data_url, data_dir) + # If the cache file does not exist, it means that the copy data has not been completed, + # and Wait for 0th card to finish copying data + while not os.path.exists("/cache/download_input.txt"): + time.sleep(1) + return + + +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir, + obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir, + obs_train_url) + str(e)) + return + + +def UploadToQizhi(train_dir, obs_train_url): + device_num = int(os.getenv('RANK_SIZE')) + local_rank = int(os.getenv('RANK_ID')) + if device_num == 1: + EnvToObs(train_dir, obs_train_url) + if device_num > 1: + if local_rank % 8 == 0: + EnvToObs(train_dir, obs_train_url) + return + + +if __name__ == "__main__": + args = parse_args() + + # modelarts + data_dir = '/cache/dataset' + train_dir = '/cache/output' + ckpt_url = '/cache/checkpoint.ckpt' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + print(args.multi_data_url) + DownloadFromQizhi(args.multi_data_url, data_dir) + + train(args) + + UploadToQizhi(train_dir, args.train_url) + + # # data sync for cloud platform if enabled + # if args.enable_modelarts: + # import moxing as mox + + # args.data_dir = f"/cache/{args.data_url}" + # 
mox.file.copy_parallel(src_url=os.path.join(args.data_url, args.dataset), dst_url=args.data_dir) + + # # core training + # train(args) + + # if args.enable_modelarts: + # mox.file.copy_parallel(src_url=args.ckpt_save_dir, dst_url=args.train_url) diff --git a/train_zhisuan_dump.py b/train_zhisuan_dump.py new file mode 100644 index 0000000000000000000000000000000000000000..dfc1ff7654c7bb1b52af3f8157b1b33695b3a782 --- /dev/null +++ b/train_zhisuan_dump.py @@ -0,0 +1,443 @@ +""" Model training pipeline """ +import logging +import os +import numpy as np +import moxing as mox +import time +import json + +import mindspore as ms +from mindspore import FixedLossScaleManager, Model, Tensor, nn, context +from mindspore.communication import get_group_size, get_rank, init + +from mindspore.context import ParallelMode + +from mindcv.data import create_dataset, create_loader, create_transforms +from mindcv.loss import create_loss +from mindcv.models import create_model +from mindcv.optim import create_optimizer +from mindcv.scheduler import create_scheduler +from mindcv.utils import ( + AllReduceSum, + StateMonitor, + create_trainer, + get_metrics, + require_customized_train_step, + set_logger, + set_seed, +) + +from config import parse_args, save_args # isort: skip + +logger = logging.getLogger("mindcv.train") + + +def train(args): + """main train function""" + + ms.set_context(mode=args.mode) + if args.distribute: + init() + device_num = get_group_size() + rank_id = get_rank() + ms.set_auto_parallel_context( + device_num=device_num, + parallel_mode="data_parallel", + gradients_mean=True, + # we should but cannot set parameter_broadcast=True, which will cause error on gpu. + ) + else: + device_num = None + rank_id = None + + set_seed(args.seed) + set_logger(name="mindcv", output_dir=args.ckpt_save_dir, rank=rank_id, color=False) + logger.info( + "We recommend installing `termcolor` via `pip install termcolor` " + "and setup logger by `set_logger(..., color=True)`" + ) + + # create dataset + dataset_train = create_dataset( + name=args.dataset, + root=args.data_dir, + split=args.train_split, + shuffle=args.shuffle, + num_samples=args.num_samples, + num_shards=device_num, + shard_id=rank_id, + num_parallel_workers=args.num_parallel_workers, + download=args.dataset_download, + num_aug_repeats=args.aug_repeats, + ) + + if args.num_classes is None: + num_classes = dataset_train.num_classes() + else: + num_classes = args.num_classes + + # create transforms + num_aug_splits = 0 + if args.aug_splits > 0: + assert args.aug_splits == 3, "Currently, only support 3 splits of augmentation" + assert args.auto_augment is not None, "aug_splits should be set with one auto_augment" + num_aug_splits = args.aug_splits + + transform_list = create_transforms( + dataset_name=args.dataset, + is_training=True, + image_resize=args.image_resize, + scale=args.scale, + ratio=args.ratio, + hflip=args.hflip, + vflip=args.vflip, + color_jitter=args.color_jitter, + interpolation=args.interpolation, + auto_augment=args.auto_augment, + mean=args.mean, + std=args.std, + re_prob=args.re_prob, + re_scale=args.re_scale, + re_ratio=args.re_ratio, + re_value=args.re_value, + re_max_attempts=args.re_max_attempts, + separate=num_aug_splits > 0, + ) + + # load dataset + loader_train = create_loader( + dataset=dataset_train, + batch_size=args.batch_size, + drop_remainder=args.drop_remainder, + is_training=True, + mixup=args.mixup, + cutmix=args.cutmix, + cutmix_prob=args.cutmix_prob, + num_classes=num_classes, + transform=transform_list, + 
+
+    if args.val_while_train:
+        dataset_eval = create_dataset(
+            name=args.dataset,
+            root=args.data_dir,
+            split=args.val_split,
+            num_shards=device_num,
+            shard_id=rank_id,
+            num_parallel_workers=args.num_parallel_workers,
+            download=args.dataset_download,
+        )
+
+        transform_list_eval = create_transforms(
+            dataset_name=args.dataset,
+            is_training=False,
+            image_resize=args.image_resize,
+            crop_pct=args.crop_pct,
+            interpolation=args.interpolation,
+            mean=args.mean,
+            std=args.std,
+        )
+
+        loader_eval = create_loader(
+            dataset=dataset_eval,
+            batch_size=args.batch_size,
+            drop_remainder=False,
+            is_training=False,
+            transform=transform_list_eval,
+            num_parallel_workers=args.num_parallel_workers,
+        )
+        # validation dataset count
+        eval_count = dataset_eval.get_dataset_size()
+        if args.distribute:
+            all_reduce = AllReduceSum()
+            eval_count = all_reduce(Tensor(eval_count, ms.int32))
+    else:
+        loader_eval = None
+        eval_count = None
+
+    num_batches = loader_train.get_dataset_size()
+    # train dataset count
+    train_count = dataset_train.get_dataset_size()
+    if args.distribute:
+        all_reduce = AllReduceSum()
+        train_count = all_reduce(Tensor(train_count, ms.int32))
+
+    # create model
+    network = create_model(
+        model_name=args.model,
+        num_classes=num_classes,
+        in_channels=args.in_channels,
+        drop_rate=args.drop_rate,
+        drop_path_rate=args.drop_path_rate,
+        pretrained=args.pretrained,
+        checkpoint_path=args.ckpt_path,
+        ema=args.ema,
+    )
+
+    num_params = sum([param.size for param in network.get_parameters()])
+
+    # create loss
+    loss = create_loss(
+        name=args.loss,
+        reduction=args.reduction,
+        label_smoothing=args.label_smoothing,
+        aux_factor=args.aux_factor,
+    )
+
+    # create learning rate schedule
+    lr_scheduler = create_scheduler(
+        num_batches,
+        scheduler=args.scheduler,
+        lr=args.lr,
+        min_lr=args.min_lr,
+        warmup_epochs=args.warmup_epochs,
+        warmup_factor=args.warmup_factor,
+        decay_epochs=args.decay_epochs,
+        decay_rate=args.decay_rate,
+        milestones=args.multi_step_decay_milestones,
+        num_epochs=args.epoch_size,
+        num_cycles=args.num_cycles,
+        cycle_decay=args.cycle_decay,
+        lr_epoch_stair=args.lr_epoch_stair,
+    )
+
+    # resume training if ckpt_path is given
+    if args.ckpt_path != "" and args.resume_opt:
+        opt_ckpt_path = os.path.join(args.ckpt_save_dir, f"optim_{args.model}.ckpt")
+    else:
+        opt_ckpt_path = ""
+
+    # create optimizer
+    # TODO: consistent naming opt, name, dataset_name
+    if (
+        args.loss_scale_type == "fixed"
+        and args.drop_overflow_update is False
+        and not require_customized_train_step(
+            args.ema,
+            args.clip_grad,
+            args.gradient_accumulation_steps,
+            args.amp_cast_list,
+        )
+    ):
+        optimizer_loss_scale = args.loss_scale
+    else:
+        optimizer_loss_scale = 1.0
+    optimizer = create_optimizer(
+        network.trainable_params(),
+        opt=args.opt,
+        lr=lr_scheduler,
+        weight_decay=args.weight_decay,
+        momentum=args.momentum,
+        nesterov=args.use_nesterov,
+        filter_bias_and_bn=args.filter_bias_and_bn,
+        loss_scale=optimizer_loss_scale,
+        checkpoint_path=opt_ckpt_path,
+        eps=args.eps,
+    )
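+
+    # Restating the branch above: with a fixed loss scale, no overflow-drop and
+    # no customized train step, scaling is folded into the optimizer; otherwise
+    # the trainer applies the loss scale itself and the optimizer is given 1.0.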
+
+    # Define eval metrics.
+    metrics = get_metrics(num_classes)
+
+    # create trainer
+    trainer = create_trainer(
+        network,
+        loss,
+        optimizer,
+        metrics,
+        amp_level=args.amp_level,
+        amp_cast_list=args.amp_cast_list,
+        loss_scale_type=args.loss_scale_type,
+        loss_scale=args.loss_scale,
+        drop_overflow_update=args.drop_overflow_update,
+        ema=args.ema,
+        ema_decay=args.ema_decay,
+        clip_grad=args.clip_grad,
+        clip_value=args.clip_value,
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+    )
+
+    # callbacks: save checkpoints, summarize training loss,
+    # record val acc and do model selection if a val dataset is available
+    begin_step = 0
+    begin_epoch = 0
+    if args.ckpt_path != "":
+        begin_step = optimizer.global_step.asnumpy()[0]
+        # recover the epoch from the checkpoint file name,
+        # assuming the pattern "{model}-{epoch}_{step}.ckpt"
+        begin_epoch = args.ckpt_path.split("/")[-1].split("-")[1].split("_")[0]
+        begin_epoch = int(begin_epoch)
+
+    summary_dir = f"./{args.ckpt_save_dir}/summary"
+    assert (
+        args.ckpt_save_policy != "top_k" or args.val_while_train is True
+    ), "when ckpt_save_policy is 'top_k', val_while_train must be True."
+    state_cb = StateMonitor(
+        trainer,
+        model_name=args.model,
+        model_ema=args.ema,
+        last_epoch=begin_epoch,
+        dataset_sink_mode=args.dataset_sink_mode,
+        dataset_val=loader_eval,
+        metric_name=list(metrics.keys()),
+        val_interval=args.val_interval,
+        ckpt_save_dir=args.ckpt_save_dir,
+        ckpt_save_interval=args.ckpt_save_interval,
+        ckpt_save_policy=args.ckpt_save_policy,
+        ckpt_keep_max=args.keep_checkpoint_max,
+        summary_dir=summary_dir,
+        log_interval=args.log_interval,
+        rank_id=rank_id,
+        device_num=device_num,
+    )
+
+    callbacks = [state_cb]
+    essential_cfg_msg = "\n".join(
+        [
+            "Essential Experiment Configurations:",
+            f"MindSpore mode[GRAPH(0)/PYNATIVE(1)]: {args.mode}",
+            f"Distributed mode: {args.distribute}",
+            f"Number of devices: {device_num if device_num is not None else 1}",
+            f"Number of training samples: {train_count}",
+            f"Number of validation samples: {eval_count}",
+            f"Number of classes: {num_classes}",
+            f"Number of batches: {num_batches}",
+            f"Batch size: {args.batch_size}",
+            f"Auto augment: {args.auto_augment}",
+            f"MixUp: {args.mixup}",
+            f"CutMix: {args.cutmix}",
+            f"Model: {args.model}",
+            f"Model parameters: {num_params}",
+            f"Number of epochs: {args.epoch_size}",
+            f"Optimizer: {args.opt}",
+            f"Learning rate: {args.lr}",
+            f"LR Scheduler: {args.scheduler}",
+            f"Momentum: {args.momentum}",
+            f"Weight decay: {args.weight_decay}",
+            f"Auto mixed precision: {args.amp_level}",
+            f"Loss scale: {args.loss_scale}({args.loss_scale_type})",
+        ]
+    )
+    logger.info(essential_cfg_msg)
+    save_args(args, os.path.join(args.ckpt_save_dir, f"{args.model}.yaml"), rank_id)
+
+    if args.ckpt_path != "":
+        logger.info(f"Resume training from {args.ckpt_path}, last step: {begin_step}, last epoch: {begin_epoch}")
+    else:
+        logger.info("Start training")
+
+    trainer.train(args.epoch_size, loader_train, callbacks=callbacks, dataset_sink_mode=args.dataset_sink_mode)
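+
+
+# The platform helpers below duplicate the ones defined earlier in this patch,
+# presumably so this dump-enabled variant stays self-contained.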
+def C2netMultiObsToEnv(multi_data_url, data_dir):
+    # --multi_data_url is a JSON string; parse it to get the list of datasets
+    print(multi_data_url)
+    multi_data_json = json.loads(multi_data_url)
+    for i in range(len(multi_data_json)):
+        zipfile_path = data_dir + "/" + multi_data_json[i]["dataset_name"]
+        try:
+            mox.file.copy(multi_data_json[i]["dataset_url"], zipfile_path)
+            print("Successfully downloaded {} to {}".format(multi_data_json[i]["dataset_url"], zipfile_path))
+            # get the file name (without extension) and unzip the dataset there
+            filename = os.path.splitext(multi_data_json[i]["dataset_name"])[0]
+            filePath = data_dir + "/" + filename
+            if not os.path.exists(filePath):
+                os.makedirs(filePath)
+            os.system("unzip {} -d {}".format(zipfile_path, filePath))
+        except Exception as e:
+            print('moxing download {} to {} failed: '.format(
+                multi_data_json[i]["dataset_url"], zipfile_path) + str(e))
+    # Write a sentinel file to mark that the data has been copied from OBS.
+    # During multi-card training the other ranks wait for this file instead of
+    # copying the dataset again.
+    with open("/cache/download_input.txt", 'w'):
+        pass
+    if os.path.exists("/cache/download_input.txt"):
+        print("download_input succeed")
+    else:
+        print("download_input failed")
+    return
+
+
+def DownloadFromQizhi(multi_data_url, data_dir):
+    device_num = int(os.getenv('RANK_SIZE'))
+    if device_num == 1:
+        C2netMultiObsToEnv(multi_data_url, data_dir)
+        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
+    if device_num > 1:
+        # set device_id and init for multi-card training
+        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target,
+                            device_id=int(os.getenv('ASCEND_DEVICE_ID')))
+        context.reset_auto_parallel_context()
+        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
+                                          gradients_mean=True, parameter_broadcast=True)
+        init()
+        # Copying OBS data only needs to happen once per node: let the 0th card copy it
+        local_rank = int(os.getenv('RANK_ID'))
+        if local_rank % 8 == 0:
+            C2netMultiObsToEnv(multi_data_url, data_dir)
+        # If the sentinel file does not exist, the data copy has not finished yet,
+        # so wait for the 0th card to finish copying the data
+        while not os.path.exists("/cache/download_input.txt"):
+            time.sleep(1)
+    return
+
+
+def EnvToObs(train_dir, obs_train_url):
+    try:
+        mox.file.copy_parallel(train_dir, obs_train_url)
+        print("Successfully uploaded {} to {}".format(train_dir, obs_train_url))
+    except Exception as e:
+        print('moxing upload {} to {} failed: '.format(train_dir, obs_train_url) + str(e))
+    return
+
+
+def UploadToQizhi(train_dir, obs_train_url):
+    device_num = int(os.getenv('RANK_SIZE'))
+    local_rank = int(os.getenv('RANK_ID'))
+    if device_num == 1:
+        EnvToObs(train_dir, obs_train_url)
+    if device_num > 1:
+        if local_rank % 8 == 0:
+            EnvToObs(train_dir, obs_train_url)
+    return
+
+
+if __name__ == "__main__":
+    args = parse_args()
+
+    # Point MindSpore at the dump configuration and give diagnostic data a home.
+    # Setting the variables in-process replaces the earlier commented-out
+    # `os.system("export ...")` attempt, which would only affect a child shell.
+    os.environ['MINDSPORE_DUMP_CONFIG'] = '/cache/code/levit_new/data_dump.json'
+    os.environ['MS_DIAGNOSTIC_DATA_PATH'] = '/cache/output'
+
+    # modelarts
+    data_dir = '/cache/dataset'
+    train_dir = '/cache/output'
+    ckpt_url = '/cache/checkpoint.ckpt'
+    if not os.path.exists(data_dir):
+        os.makedirs(data_dir)
+    if not os.path.exists(train_dir):
+        os.makedirs(train_dir)
+    print(args.multi_data_url)
+    DownloadFromQizhi(args.multi_data_url, data_dir)
+
+    train(args)
+
+    UploadToQizhi(train_dir, args.train_url)
+
+    # # data sync for cloud platform if enabled
+    # if args.enable_modelarts:
+    #     import moxing as mox
+
+    #     args.data_dir = f"/cache/{args.data_url}"
+    #     mox.file.copy_parallel(src_url=os.path.join(args.data_url, args.dataset), dst_url=args.data_dir)
+
+    # # core training
+    # train(args)
+
+    # if args.enable_modelarts:
+    #     mox.file.copy_parallel(src_url=args.ckpt_save_dir, dst_url=args.train_url)
\ No newline at end of file