diff --git a/config.py b/config.py
index b78e5a9b8f95ce49b50bfc24c2103a1a10844eb5..b1fd1e5230ce8bd4590ae0a8c5760e8aa03fdf19 100644
--- a/config.py
+++ b/config.py
@@ -23,7 +23,8 @@ def create_parser():
     # The first arg parser parses out only the --config argument, this argument is used to
     # load a yaml file containing key-values that override the defaults for the main parser below
     parser_config = argparse.ArgumentParser(description='Training Config', add_help=False)
-    parser_config.add_argument('-c', '--config', type=str, default='',
-                               help='YAML config file specifying default arguments (default="")')
+    parser_config.add_argument('-c', '--config', type=str,
+                               default='/home/work/user-job-dir/V0001/configs/levit/levit_128s_new_4NPU.yaml',
+                               help='YAML config file specifying default arguments')
 
     # The main parser. It inherits the --config argument for better help information.
@@ -94,7 +95,7 @@ def create_parser():
                             'Example: "randaug-m10-n2-w0-mstd0.5-mmax10-inc0", "autoaug-mstd0.5" or autoaugr-mstd0.5.')
     group.add_argument('--aug_splits', type=int, default=0,
                        help='Number of augmentation splits (default: 0, valid: 3 (currently, only support 3 splits))'
-                       'it should be set with one auto_augment')
+                            'it should be used together with auto_augment')
     group.add_argument('--re_prob', type=float, default=0.0,
                        help='Probability of performing erasing (default=0.0)')
     group.add_argument('--re_scale', type=tuple, default=(0.02, 0.33),
@@ -269,8 +270,33 @@ def create_parser():
                        help='pre_train_model path in obs')
     group.add_argument('--train_url', type=str, default='/cache/output/',
                        help='model folder to save/load')
+    group.add_argument('--pretrain_url', type=str, default='/cache/data/',
+                       help='path of the pretrained model to load')
+    group.add_argument('--model_url', type=str, default='/cache/output/',
+                       help='path to model')
+    group.add_argument('--grampus_code_file_name', type=str, default='',
+                       help='code file name')
+    # arguments for the distillation teacher model
+    group.add_argument('--name_for_distillation', type=str, default='distillation_for_levit',
+                       help='loss name that enables distillation for LeViT')
+    parser.add_argument('--teacher_model', default='regnety_160', type=str,
+                        choices=['regnety_160'],
+                        help='Name of teacher model to train '
+                             '(default: "regnety_160")')
+    parser.add_argument('--teacher_path', type=str, default='')
+    parser.add_argument('--distillation_type', default='none',
+                        choices=['none', 'soft', 'hard'], type=str,
+                        help='distillation type (default: "none")')
+    parser.add_argument('--distillation_alpha', default=0.5, type=float,
+                        help='weight of the distillation loss (default: 0.5)')
+    parser.add_argument('--distillation_tau', default=1.0, type=float,
+                        help='temperature for soft distillation (default: 1.0)')
+    parser.add_argument('--bce_loss', action='store_true')
+    parser.add_argument('--cutmix_minmax', type=float, nargs='+', default=None,
+                        help='cutmix min/max ratio, overrides '
+                             'alpha and enables cutmix if set (default: None)')
     return parser_config, parser
diff --git a/configs/levit/levit_128s_GPU.yaml b/configs/levit/levit_128s_GPU.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0f3f9bdc1e475bfb8a6217804870979ed46e66f5
--- /dev/null
+++ b/configs/levit/levit_128s_GPU.yaml
@@ -0,0 +1,55 @@
+# system config
+mode: 0
+distribute: True
+num_parallel_workers: 8
+
+# dataset config
+dataset: 'imagenet'
+data_dir: ''
+shuffle: True
+dataset_download: False
+batch_size: 256
+drop_remainder: True
+
+# Augmentation config
+image_resize: 224
+scale: [0.08, 1.0]
+ratio: [0.75, 1.333]
+hflip: 0.5
+interpolation: 'bicubic'
+auto_augment: 'randaug-m9-mstd0.5-inc1'
+re_prob: 0.25
+mixup: 0.8
+cutmix: 1.0
+cutmix_prob: 1.0
+color_jitter: 0.4
+
+# model config
+model: 'LeViT_128S'
+num_classes: 1000 +pretrained: False +ckpt_path: '' +keep_checkpoint_max: 10 +ckpt_save_dir: './' +epoch_size: 300 +dataset_sink_mode: True +amp_level: 'O2' + +# loss config +loss: 'CE' +label_smoothing: 0.1 + +# lr scheduler config +scheduler: 'warmup_cosine_decay' +lr: 0.0005 +min_lr: 0.00001 +warmup_epochs: 5 +decay_epochs: 250 +decay_rate: 0.1 + +# optimizer config +opt: 'adamw' +weight_decay: 0.025 +momentum: 0.9 +loss_scale: 1024 +use_nesterov: False \ No newline at end of file diff --git a/configs/levit/levit_128s_ascend.yaml b/configs/levit/levit_128s_ascend.yaml new file mode 100644 index 0000000000000000000000000000000000000000..88a18587c9a746399bb50edbd8a1ef9ff62a66e4 --- /dev/null +++ b/configs/levit/levit_128s_ascend.yaml @@ -0,0 +1,55 @@ +# system config +mode: 0 +distribute: True +num_parallel_workers: 8 + +# dataset config +dataset: 'imagenet' +data_dir: './imagenet/' +shuffle: True +dataset_download: False +batch_size: 256 +drop_remainder: True + +# Augmentation config +image_resize: 224 +scale: [0.08, 1.0] +ratio: [0.75, 1.333] +hflip: 0.5 +interpolation: 'bicubic' +auto_augment: 'randaug-m9-mstd0.5-inc1' +re_prob: 0.25 +mixup: 0.8 +cutmix: 1.0 +cutmix_prob: 1.0 +color_jitter: 0.4 + +# model config +model: 'LeViT_128S' +num_classes: 1000 +pretrained: False +ckpt_path: '' +keep_checkpoint_max: 10 +ckpt_save_dir: './ckpt/' +epoch_size: 300 +dataset_sink_mode: True +amp_level: 'O3' + +# loss config +loss: 'CE' +label_smoothing: 0.1 + +# lr scheduler config +scheduler: 'warmup_cosine_decay' +lr: 0.0005 +min_lr: 0.00001 +warmup_epochs: 5 +decay_epochs: 30 +decay_rate: 0.1 + +# optimizer config +opt: 'adamw' +weight_decay: 0.025 +momentum: 0.9 +loss_scale: 1024 +use_nesterov: False \ No newline at end of file diff --git a/configs/levit/levit_128s_ascend_v2.yaml b/configs/levit/levit_128s_ascend_v2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e5206cd98095a91d1e3ec710191bb87360993ea3 --- /dev/null +++ b/configs/levit/levit_128s_ascend_v2.yaml @@ -0,0 +1,72 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================
+
+# system config
+mode: 0
+distribute: True
+num_parallel_workers: 8
+val_while_train: True
+
+# dataset config
+dataset: 'imagenet'
+data_dir: './imagenet/'
+shuffle: True
+dataset_download: False
+batch_size: 256
+drop_remainder: True
+
+# Augmentation config
+image_resize: 224
+scale: [0.08, 1.0]
+ratio: [0.75, 1.333]
+hflip: 0.5
+interpolation: 'bicubic'
+auto_augment: 'randaug-m9-mstd0.5-inc1'
+re_prob: 0.25
+mixup: 0.8
+cutmix: 1.0
+cutmix_prob: 1.0
+color_jitter: 0.4
+
+# model config
+model: 'LeViT_128S'
+num_classes: 1000
+pretrained: False
+ckpt_path: ''
+keep_checkpoint_max: 10
+ckpt_save_dir: './ckpt/'
+epoch_size: 1000
+dataset_sink_mode: True
+amp_level: 'O2'
+
+# loss config
+loss: 'CE'
+label_smoothing: 0.1
+
+# lr scheduler config
+scheduler: 'warmup_cosine_decay'
+lr: 0.0005
+min_lr: 0.00001
+warmup_epochs: 5
+decay_epochs: 345
+decay_rate: 0.1
+
+# optimizer config
+opt: 'adamw'
+weight_decay: 0.025
+momentum: 0.9
+loss_scale: 1024
+use_nesterov: False
+eps: 1e-8
diff --git a/configs/levit/levit_128s_ascend_v2_plus.yaml b/configs/levit/levit_128s_ascend_v2_plus.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..94111b911513962e173cdf4c484e49b1c23bdd3f
--- /dev/null
+++ b/configs/levit/levit_128s_ascend_v2_plus.yaml
@@ -0,0 +1,55 @@
+# system config
+mode: 0
+distribute: True
+num_parallel_workers: 8
+
+# dataset config
+dataset: 'imagenet'
+data_dir: './imagenet/'
+shuffle: True
+dataset_download: False
+batch_size: 64
+drop_remainder: True
+
+# Augmentation config
+image_resize: 224
+scale: [0.08, 1.0]
+ratio: [0.75, 1.333]
+hflip: 0.5
+interpolation: 'bicubic'
+auto_augment: 'randaug-m9-mstd0.5-inc1'
+re_prob: 0.25
+mixup: 0.8
+cutmix: 1.0
+cutmix_prob: 1.0
+color_jitter: 0.4
+
+# model config
+model: 'LeViT_128S'
+num_classes: 1000
+pretrained: False
+ckpt_path: ''
+keep_checkpoint_max: 10
+ckpt_save_dir: './ckpt/'
+epoch_size: 500
+dataset_sink_mode: True
+amp_level: 'O2'
+
+# loss config
+loss: 'CE'
+label_smoothing: 0.1
+
+# lr scheduler config
+scheduler: 'warmup_cosine_decay'
+lr: 0.00005
+min_lr: 0.000001
+warmup_epochs: 5
+decay_epochs: 50
+decay_rate: 0.1
+
+# optimizer config
+opt: 'adamw'
+weight_decay: 0.025
+momentum: 0.9
+loss_scale: 1024
+use_nesterov: False
\ No newline at end of file
diff --git a/configs/levit/levit_128s_ascend_v2_se.yaml b/configs/levit/levit_128s_ascend_v2_se.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..90e98cae6413dfdb96a8b9cec2054c748f50cfb3
--- /dev/null
+++ b/configs/levit/levit_128s_ascend_v2_se.yaml
@@ -0,0 +1,55 @@
+# system config
+mode: 0
+distribute: False
+num_parallel_workers: 1
+
+# dataset config
+dataset: 'imagenet'
+data_dir: './imagenet/'
+shuffle: True
+dataset_download: False
+batch_size: 32
+drop_remainder: True
+
+# Augmentation config
+image_resize: 224
+scale: [0.08, 1.0]
+ratio: [0.75, 1.333]
+hflip: 0.5
+interpolation: 'bicubic'
+auto_augment: 'randaug-m9-mstd0.5-inc1'
+re_prob: 0.25
+mixup: 0.8
+cutmix: 1.0
+cutmix_prob: 1.0
+color_jitter: 0.4
+
+# model config
+model: 'LeViT_128S'
+num_classes: 1000
+pretrained: False
+ckpt_path: ''
+keep_checkpoint_max: 10
+ckpt_save_dir: './ckpt/'
+epoch_size: 300
+dataset_sink_mode: True
+amp_level: 'O2'
+
+# loss config
+loss: 'CE'
+label_smoothing: 0.1
+
+# lr scheduler config
+scheduler: 'warmup_cosine_decay'
+lr: 0.0005
+min_lr: 0.00001
+warmup_epochs: 5
+decay_epochs: 30
+decay_rate: 0.1
+
+# optimizer config
+opt: 'adamw'
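+# note: momentum below is only read by SGD/momentum-style optimizers;
+# adamw relies on its default betas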
+weight_decay: 0.025 +momentum: 0.9 +loss_scale: 1024 +use_nesterov: False \ No newline at end of file diff --git a/configs/levit/levit_128s_gpu b/configs/levit/levit_128s_gpu new file mode 100644 index 0000000000000000000000000000000000000000..8a95a971baf8aa7a6aca174c7ed4ce5b8d4618c1 --- /dev/null +++ b/configs/levit/levit_128s_gpu @@ -0,0 +1,56 @@ +# system config +mode: 0 +distribute: False +num_parallel_workers: 1 +val_while_train: True + +# dataset config +dataset: 'imagenet' +data_dir: '/cache/dataset/imagenet/imagenet/' +shuffle: True +dataset_download: False +batch_size: 128 +drop_remainder: True + +# Augmentation config +image_resize: 224 +scale: [0.08, 1.0] +ratio: [0.75, 1.333] +hflip: 0.5 +interpolation: 'bicubic' +auto_augment: 'randaug-m9-mstd0.5-inc1' +re_prob: 0.25 +mixup: 0.8 +cutmix: 1.0 +cutmix_prob: 1.0 +color_jitter: 0.4 + +# model config +model: 'LeViT_128S' +num_classes: 1000 +pretrained: False +ckpt_path: '' +keep_checkpoint_max: 10 +ckpt_save_dir: './ckpt' +epoch_size: 300 +dataset_sink_mode: True +amp_level: 'O3' + +# loss config +loss: 'CE' +label_smoothing: 0.1 + +# lr scheduler config +scheduler: 'warmup_cosine_decay' +lr: 0.0005 +min_lr: 0.00001 +warmup_epochs: 5 +decay_epochs: 30 +decay_rate: 0.1 + +# optimizer config +opt: 'adamw' +weight_decay: 0.025 +momentum: 0.9 +loss_scale: 1024 +use_nesterov: False \ No newline at end of file diff --git a/configs/levit/levit_128s_new.yaml b/configs/levit/levit_128s_new.yaml new file mode 100644 index 0000000000000000000000000000000000000000..459d246f162ad6a16a6eb1c3514940a757e693e2 --- /dev/null +++ b/configs/levit/levit_128s_new.yaml @@ -0,0 +1,58 @@ +# system config +mode: 0 +distribute: True +num_parallel_workers: 8 +val_while_train: True +enable_modelarts: True + +# dataset config +dataset: 'imagenet' +data_dir: '/cache/dataset/imagenet/imagenet/' # './imagenet/' +shuffle: True +dataset_download: False +batch_size: 256 +drop_remainder: True + +# Augmentation config +image_resize: 224 +scale: [0.08, 1.0] +ratio: [0.75, 1.333] +hflip: 0.5 +interpolation: 'bicubic' +auto_augment: 'randaug-m9-mstd0.5-inc1' +re_prob: 0.25 +mixup: 0.8 +cutmix: 1.0 +cutmix_prob: 1.0 +color_jitter: 0.4 + +# model config +model: 'LeViT_128S' +num_classes: 1000 +pretrained: False +ckpt_path: '' +keep_checkpoint_max: 10 +ckpt_save_dir: '/cache/output/' +epoch_size: 350 +dataset_sink_mode: True +amp_level: 'O2' + +# loss config +loss: 'CE' +label_smoothing: 0.1 + +# lr scheduler config +scheduler: 'warmup_cosine_decay' +lr: 0.0005 +min_lr: 0.00001 +warmup_epochs: 5 +decay_epochs: 345 +decay_rate: 0.1 + +# optimizer config +opt: 'adamw' +weight_decay: 0.025 +momentum: 0.9 +loss_scale: 1024 +use_nesterov: False +eps: 1e-8 diff --git a/configs/levit/levit_128s_new_4NPU.yaml b/configs/levit/levit_128s_new_4NPU.yaml new file mode 100644 index 0000000000000000000000000000000000000000..43df957f482fb22ee9323eaa9e40f1952c6d7fbe --- /dev/null +++ b/configs/levit/levit_128s_new_4NPU.yaml @@ -0,0 +1,57 @@ +# system config +mode: 0 +distribute: True +num_parallel_workers: 4 +val_while_train: True + +# dataset config +dataset: 'imagenet' +data_dir: './imagenet/' +shuffle: True +dataset_download: False +batch_size: 256 +drop_remainder: True + +# Augmentation config +image_resize: 224 +scale: [0.08, 1.0] +ratio: [0.75, 1.333] +hflip: 0.5 +interpolation: 'bicubic' +auto_augment: 'randaug-m9-mstd0.5-inc1' +re_prob: 0.25 +mixup: 0.8 +cutmix: 1.0 +cutmix_prob: 1.0 +color_jitter: 0.4 + +# model config +model: 'LeViT_128S' +num_classes: 1000 +pretrained: False 
+ckpt_path: ''
+keep_checkpoint_max: 10
+ckpt_save_dir: './ckpt/'
+epoch_size: 350
+dataset_sink_mode: True
+amp_level: 'O2'
+
+# loss config
+loss: 'CE'
+label_smoothing: 0.1
+
+# lr scheduler config
+scheduler: 'warmup_cosine_decay'
+lr: 0.0005
+min_lr: 0.00001
+warmup_epochs: 5
+decay_epochs: 345
+decay_rate: 0.1
+
+# optimizer config
+opt: 'adamw'
+weight_decay: 0.025
+momentum: 0.9
+loss_scale: 1024
+use_nesterov: False
+eps: 1e-8
diff --git a/configs/levit/levit_128s_new_distillation.yaml b/configs/levit/levit_128s_new_distillation.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f9f3d00d062ddb2f03345b9bf244cf038f2be68e
--- /dev/null
+++ b/configs/levit/levit_128s_new_distillation.yaml
@@ -0,0 +1,67 @@
+# system config
+mode: 0
+distribute: True
+num_parallel_workers: 8
+val_while_train: True
+enable_modelarts: True
+
+# dataset config
+dataset: 'imagenet'
+data_dir: '/cache/dataset/imagenet/imagenet/'  # './imagenet/'
+shuffle: True
+dataset_download: False
+batch_size: 256
+drop_remainder: True
+
+# Augmentation config
+image_resize: 224
+scale: [0.08, 1.0]
+ratio: [0.75, 1.333]
+hflip: 0.5
+interpolation: 'bicubic'
+auto_augment: 'randaug-m9-mstd0.5-inc1'
+re_prob: 0.25
+mixup: 0.8
+cutmix: 1.0
+cutmix_prob: 1.0
+color_jitter: 0.4
+
+# model config
+model: 'LeViT_128S'
+num_classes: 1000
+pretrained: False
+ckpt_path: ''
+keep_checkpoint_max: 10
+ckpt_save_dir: '/cache/output/'
+epoch_size: 350
+dataset_sink_mode: True
+amp_level: 'O2'
+
+# loss config
+loss: 'CE'
+label_smoothing: 0.1
+
+# lr scheduler config
+scheduler: 'warmup_cosine_decay'
+lr: 0.0005
+min_lr: 0.00001
+warmup_epochs: 5
+decay_epochs: 345
+decay_rate: 0.1
+
+# optimizer config
+opt: 'adamw'
+weight_decay: 0.025
+momentum: 0.9
+loss_scale: 1024
+use_nesterov: False
+eps: 1e-8
+
+# Distillation
+distillation_type: hard
+teacher_path: '/cache/code/levit_new/regnety_160.ckpt'
+teacher_model: regnety_160
+distillation_alpha: 0.5
+distillation_tau: 1.0
+bce_loss: false
+
diff --git a/configs/levit/levit_128s_new_lr.yaml b/configs/levit/levit_128s_new_lr.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..856471446d997ae80d5063876461708d2d78e4bd
--- /dev/null
+++ b/configs/levit/levit_128s_new_lr.yaml
@@ -0,0 +1,58 @@
+# system config
+mode: 0
+distribute: True
+num_parallel_workers: 8
+val_while_train: True
+enable_modelarts: True
+
+# dataset config
+dataset: 'imagenet'
+data_dir: '/cache/dataset/imagenet/imagenet/'  # './imagenet/'
+shuffle: True
+dataset_download: False
+batch_size: 256
+drop_remainder: True
+
+# Augmentation config
+image_resize: 224
+scale: [0.08, 1.0]
+ratio: [0.75, 1.333]
+hflip: 0.5
+interpolation: 'bicubic'
+auto_augment: 'randaug-m9-mstd0.5-inc1'
+re_prob: 0.25
+mixup: 0.2  # 0.8
+cutmix: 1.0
+cutmix_prob: 1.0
+color_jitter: 0.4
+
+# model config
+model: 'LeViT_128S'
+num_classes: 1000
+pretrained: False
+ckpt_path: ''
+keep_checkpoint_max: 10
+ckpt_save_dir: '/cache/output/'
+epoch_size: 350
+dataset_sink_mode: True
+amp_level: 'O2'
+
+# loss config
+loss: 'CE'
+label_smoothing: 0.1
+
+# lr scheduler config
+scheduler: 'warmup_cosine_decay'
+lr: 0.00005  # 0.0005
+min_lr: 0.00001
+warmup_epochs: 5
+decay_epochs: 345
+decay_rate: 0.1
+
+# optimizer config
+opt: 'adamw'
+weight_decay: 0.025
+momentum: 0.9
+loss_scale: 1024
+use_nesterov: False
+eps: 1e-8
diff --git a/data_dump.json b/data_dump.json
new file mode 100644
index 0000000000000000000000000000000000000000..757dbc05da43bef105e965c545a474609070988c
--- /dev/null
+++ b/data_dump.json
@@ -0,0
+1,14 @@ +{ + "common_dump_settings": { + "dump_mode": 0, + "path": "", + "net_name": "LeViT", + "iteration": "0|3-4", + "saved_data": "tensor", + "input_output": 0, + "kernels": [""], + "support_device": [0,1,2,3,4,5,6,7], + "op_debug_mode": 3, + "file_format": "npy" + } +} \ No newline at end of file diff --git a/mindcv/loss/distillation/__init__.py b/mindcv/loss/distillation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/mindcv/loss/distillation/criterion.py b/mindcv/loss/distillation/criterion.py new file mode 100644 index 0000000000000000000000000000000000000000..01be8fef1f630739c814db087280352913f47dcd --- /dev/null +++ b/mindcv/loss/distillation/criterion.py @@ -0,0 +1,284 @@ +# Copyright 2022 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""functions of criterion""" +import mindspore as ms +import mindspore.nn as nn +from mindspore import Tensor +from mindspore import ops +from mindspore.common import dtype as mstype +from mindspore.nn.loss.loss import LossBase +from mindspore.ops import ( + functional, + operations, + LogSoftmax, + KLDivLoss, + Size +) + +from .regnet import regnety_160 + + +# from .factory import create_teacher_model + +class SoftTargetCrossEntropy(LossBase): + """SoftTargetCrossEntropy for MixUp Augment""" + + def __init__(self): + super(SoftTargetCrossEntropy, self).__init__() + self.mean_ops = operations.ReduceMean(keep_dims=False) + self.sum_ops = operations.ReduceSum(keep_dims=False) + self.log_softmax = operations.LogSoftmax() + + def construct(self, logits, labels): + logits = operations.Cast()(logits, mstype.float32) + labels = operations.Cast()(labels, mstype.float32) + loss = self.sum_ops((-1 * labels) * self.log_softmax(logits), -1) + return self.mean_ops(loss) + + +class CrossEntropySmooth(LossBase): + """CrossEntropy""" + + def __init__(self, sparse=True, reduction='mean', + smooth_factor=0., num_classes=1000): + super(CrossEntropySmooth, self).__init__() + self.onehot = operations.OneHot() + self.sparse = sparse + self.on_value = Tensor(1.0 - smooth_factor, mstype.float32) + self.off_value = Tensor( + 1.0 * smooth_factor / (num_classes - 1), mstype.float32 + ) + self.ce = nn.SoftmaxCrossEntropyWithLogits(reduction=reduction) + self.cast = ops.Cast() + + def construct(self, logits, labels): + if self.sparse: + labels = self.onehot( + labels, functional.shape(logits)[1], + self.on_value, self.off_value + ) + labels = operations.Cast()(labels, mstype.float32) + logits = operations.Cast()(logits, mstype.float32) + loss2 = self.ce(logits, labels) + return loss2 + + +class DistillationLoss(LossBase): + """ + This module wraps a standard criterion and adds an extra knowledge + distillation loss by taking a teacher model prediction and + using it as additional supervision. 
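+    With 'hard' distillation the extra term is a cross-entropy against the
+    teacher's argmax prediction; with 'soft' it is a temperature-scaled KL
+    divergence between the student's and the teacher's logit distributions.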
+ """ + + def __init__(self, base_criterion: LossBase, teacher_model: nn.Cell, + distillation_type: str, alpha: float, tau: float): + super().__init__() + self.base_criterion = base_criterion + self.teacher_model = teacher_model + assert distillation_type in ['none', 'soft', 'hard'] + self.distillation_type = distillation_type + self.alpha = alpha + self.tau = tau + + self.kl_div = KLDivLoss(reduction='batchmean') + self.log_softmax = LogSoftmax(axis=1) + self.cross_entropy = nn.CrossEntropyLoss() + + def construct(self, inputs, outputs, labels): + """ + Args: + inputs: The original inputs that are feed to the teacher model + outputs: the outputs of the model to be trained. It is expected to be + either a Tensor, or a Tuple[Tensor, Tensor], with the original output + in the first position and the distillation predictions as the second output + labels: the labels for the base criterion + """ + outputs_kd = None + if not isinstance(outputs, ms.Tensor): + # assume that the model outputs a tuple of [outputs, outputs_kd] + outputs, outputs_kd = outputs + base_loss = self.base_criterion(outputs, labels) + + if self.distillation_type == 'none': + return base_loss + + if outputs_kd is None: + raise ValueError("When knowledge distillation is enabled, the model is " + "expected to return a Tuple[Tensor, Tensor] with the output of the " + "class_token and the dist_token") + + teacher_outputs = self.teacher_model(inputs) + dist_loss = 0.0 + if self.distillation_type == 'soft': + T = self.tau + # taken from https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100 + # with slight modifications + dist_loss = self.kl_div( + self.log_softmax(outputs_kd / T), + # We provide the teacher's targets in log probability because we use log_target=True (as recommended + # in pytorch https://github.com/pytorch/pytorch/blob/9324181d0ac7b4f7949a574dbc3e8be30abe7041/torch + # /nn/functional.py#L2719) but it is possible to give just the probabilities and set + # log_target=False. In our experiments we tried both. + self.log_softmax(teacher_outputs / T), + reduction='sum', + ) * (T * T) / Size()(outputs_kd) + # We divide by outputs_kd.numel() to have the legacy PyTorch behavior. 
+            # We also experimented with dividing by outputs_kd.size(0) instead;
+            # see issue 61 (https://github.com/facebookresearch/deit/issues/61) for more details.
+        elif self.distillation_type == 'hard':
+            dist_loss = self.cross_entropy(
+                outputs_kd, teacher_outputs.argmax(axis=1)
+            )
+
+        loss = base_loss * (1 - self.alpha) + dist_loss * self.alpha
+        return loss
+
+
+def get_model_by_name(model_name, **kwargs):
+    """get network by name and initialize it"""
+
+    models = {
+        'regnety_160': regnety_160
+    }
+    return models[model_name](**kwargs)
+
+
+def create_teacher_model(
+        model_name,
+        checkpoint_path=None,
+        **kwargs):
+    """Create model by name with given parameters"""
+
+    model = get_model_by_name(
+        model_name, **kwargs
+    )
+    if checkpoint_path is not None:
+        param_dict = ms.load_checkpoint(checkpoint_path)
+        ms.load_param_into_net(model, param_dict)
+
+    return model
+
+
+def get_criterion_by_args(args):
+    criterion = get_criterion(
+        smoothing=args.label_smoothing,  # 0.1; reuses the label_smoothing config value
+        num_classes=args.num_classes,  # 1000
+        mixup=args.mixup,  # 0.8
+        cutmix=args.cutmix,  # 1.0
+        # cutmix_minmax is intentionally not forwarded: get_criterion does not
+        # accept it, and passing it would raise a TypeError
+        bce_loss=args.bce_loss,  # False
+        distillation_type=args.distillation_type,  # hard
+        teacher_path=args.teacher_path,  # teacher checkpoint path
+        teacher_model=args.teacher_model,  # regnety_160
+        distillation_alpha=args.distillation_alpha,  # 0.5
+        distillation_tau=args.distillation_tau  # 1.0
+    )
+    return criterion
+
+
+def get_criterion(
+        smoothing,
+        num_classes,
+        mixup,
+        cutmix,
+        bce_loss,
+        distillation_type,
+        teacher_path,
+        teacher_model,
+        distillation_alpha,
+        distillation_tau
+):
+    """Get criterion function"""
+    assert 0 <= smoothing <= 1.
+
+    mixup_active = mixup > 0 or cutmix > 0
+
+    if mixup_active:
+        # smoothing is handled with mixup label transform
+        print(25 * "=" + "Using MixBatch" + 25 * "=")
+        criterion = SoftTargetCrossEntropy()
+    elif smoothing:
+        print(25 * "=" + "Using label smoothing" + 25 * "=")
+        criterion = CrossEntropySmooth(sparse=True, reduction="mean",
+                                       smooth_factor=smoothing,
+                                       num_classes=num_classes)
+    else:
+        criterion = nn.SoftmaxCrossEntropyWithLogits()
+
+    if bce_loss:
+        criterion = nn.BCEWithLogitsLoss()
+
+    teacher_net = None
+    if distillation_type != 'none':
+        assert teacher_path, 'need to specify teacher-path when using distillation'
+        print(f"Creating teacher model: {teacher_model}")
+        teacher_net = create_teacher_model(
+            teacher_model,
+            checkpoint_path=teacher_path,
+        )
+        teacher_net.set_train(False)
+
+    # wrap the criterion in our custom DistillationLoss, which
+    # just dispatches to the original criterion if distillation_type is 'none'
+    criterion = DistillationLoss(
+        criterion,
+        teacher_net,  # e.g. regnety_160
+        distillation_type,  # 'none', 'soft' or 'hard'
+        distillation_alpha,  # 0.5
+        distillation_tau  # 1.0
+    )
+
+    return criterion
+
+
+class NetWithLoss(nn.Cell):
+    """
+    NetWithLoss: only supports classification networks.
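+    The criterion is invoked as criterion(data, predict, label), which lines up
+    with the (inputs, outputs, labels) signature of DistillationLoss.construct.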
+ """ + + def __init__(self, model, criterion): + super(NetWithLoss, self).__init__() + self.model = model + self.criterion = criterion + + def construct(self, *inputs, **kwargs): + data = inputs[0] + label = inputs[1] + predict = self.model(data) + loss = self.criterion(data, predict, label) + return loss diff --git a/mindcv/loss/distillation/regnet.py b/mindcv/loss/distillation/regnet.py new file mode 100644 index 0000000000000000000000000000000000000000..b350b9521be86b2cf0f76a2e3c1b83069157cf9f --- /dev/null +++ b/mindcv/loss/distillation/regnet.py @@ -0,0 +1,650 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + + +import types +import functools +import numpy as np +import mindspore as ms +import mindspore.nn as nn +import mindspore.ops as ops + +from mindspore.ops import Div, UniformReal, Floor +from copy import deepcopy +from typing import Callable, Tuple + +# from .mindcv.models.layers.pooling import GlobalAvgPooling + +# from .classifier import ClassifierHead +# from .conv_bn_act import ConvBnAct +# from .drop_path import DropPath +# from .se import SEModule + + +IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) +IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) + + +def _cfg(url=''): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), + 'pool_size': (7, 7), 'crop_pct': 0.875, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.conv', 'classifier': 'head.fc', + } + + +def _mcfg(**kwargs): + cfg = dict(se_ratio=0., bottle_ratio=1., stem_width=32) + cfg.update(**kwargs) + return cfg + + +# Model FLOPS = three trailing digits * 10^8 +model_cfgs = dict( + regnetx_002=_mcfg(w0=24, wa=36.44, wm=2.49, group_w=8, depth=13), + regnetx_004=_mcfg(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22), + regnetx_006=_mcfg(w0=48, wa=36.97, wm=2.24, group_w=24, depth=16), + regnetx_008=_mcfg(w0=56, wa=35.73, wm=2.28, group_w=16, depth=16), + regnetx_016=_mcfg(w0=80, wa=34.01, wm=2.25, group_w=24, depth=18), + regnetx_032=_mcfg(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25), + regnetx_040=_mcfg(w0=96, wa=38.65, wm=2.43, group_w=40, depth=23), + regnetx_064=_mcfg(w0=184, wa=60.83, wm=2.07, group_w=56, depth=17), + regnetx_080=_mcfg(w0=80, wa=49.56, wm=2.88, group_w=120, depth=23), + regnetx_120=_mcfg(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19), + regnetx_160=_mcfg(w0=216, wa=55.59, wm=2.1, group_w=128, depth=22), + regnetx_320=_mcfg(w0=320, wa=69.86, wm=2.0, group_w=168, depth=23), + regnety_002=_mcfg(w0=24, wa=36.44, wm=2.49, group_w=8, depth=13, se_ratio=0.25), + regnety_004=_mcfg(w0=48, wa=27.89, wm=2.09, group_w=8, depth=16, se_ratio=0.25), + regnety_006=_mcfg(w0=48, wa=32.54, wm=2.32, group_w=16, depth=15, se_ratio=0.25), + regnety_008=_mcfg(w0=56, wa=38.84, wm=2.4, group_w=16, depth=14, se_ratio=0.25), + regnety_016=_mcfg(w0=48, wa=20.71, wm=2.65, group_w=24, depth=27, se_ratio=0.25), + 
regnety_032=_mcfg(w0=80, wa=42.63, wm=2.66, group_w=24, depth=21, se_ratio=0.25), + regnety_040=_mcfg(w0=96, wa=31.41, wm=2.24, group_w=64, depth=22, se_ratio=0.25), + regnety_064=_mcfg(w0=112, wa=33.22, wm=2.27, group_w=72, depth=25, se_ratio=0.25), + regnety_080=_mcfg(w0=192, wa=76.82, wm=2.19, group_w=56, depth=17, se_ratio=0.25), + regnety_120=_mcfg(w0=168, wa=73.36, wm=2.37, group_w=112, depth=19, se_ratio=0.25), + regnety_160=_mcfg(w0=200, wa=106.23, wm=2.48, group_w=112, depth=18, se_ratio=0.25), + regnety_320=_mcfg(w0=232, wa=115.89, wm=2.53, group_w=232, depth=20, se_ratio=0.25), +) + + +# ClassifierHead +def adaptive_pool_feat_mult(pool_type='avg'): + if pool_type == 'catavgmax': + return 2 + return 1 + + +class GlobalAvgPooling(nn.Cell): + """ + GlobalAvgPooling, same as torch.nn.AdaptiveAvgPool2d when output shape is 1 + """ + + def __init__(self, keep_dims: bool = False) -> None: + super().__init__() + self.keep_dims = keep_dims + + def construct(self, x): + x = ops.mean(x, axis=(2, 3), keep_dims=self.keep_dims) + return x + + +# class SelectAdaptivePool2d(nn.Cell): +# """Selectable global pooling layer with dynamic input kernel size +# """ +# +# def __init__(self, output_size=1, pool_type='avg', flatten=False): +# super(SelectAdaptivePool2d, self).__init__() +# self.pool_type = pool_type or '' +# self.flatten = flatten +# # self.pool = GlobalAvgPooling(output_size) +# self.pool = ms.ops.AdaptiveAvgPool2D(output_size) +# +# def is_identity(self): +# return self.pool_type == '' +# +# def construct(self, *inputs, **kwargs): +# x = inputs[0] +# x = self.pool(x) +# if self.flatten: +# x = ms.nn.Flatten()(x) +# return x +# +# def feat_mult(self): +# return adaptive_pool_feat_mult(self.pool_type) +# +# def __repr__(self): +# return self.__class__.__name__ + ' (' \ +# + 'pool_type=' + self.pool_type \ +# + ', flatten=' + str(self.flatten) + ')' + + +# def create_classifier(num_features, num_classes, pool_type='avg', use_conv=False): +# flatten = not use_conv # flatten when we use a Linear layer after pooling +# if not pool_type: +# assert num_classes == 0 or use_conv, \ +# 'Pooling can only be disabled if classifier is also removed or conv classifier is used' +# flatten = False # disable flattening if pooling is pass-through (no pooling) +# global_pool = SelectAdaptivePool2d(pool_type=pool_type, flatten=flatten) +# # global_pool = GlobalAvgPooling() +# num_pooled_features = num_features * global_pool.feat_mult() +# if num_classes <= 0: +# fc = nn.Identity() # pass-through (no classifier) +# elif use_conv: +# fc = nn.Conv2d(num_pooled_features, num_classes, 1, has_bias=True, pad_mode='valid') +# else: +# # NOTE: using my Linear wrapper that fixes AMP + torchscript casting issue +# fc = nn.Dense(num_pooled_features, num_classes, has_bias=True) +# return global_pool, fc + + +class ClassifierHead(nn.Cell): + """Classifier head w/ configurable global pooling and dropout.""" + + def __init__(self, in_chs, num_classes, pool_type='avg', drop_rate=0.): + super(ClassifierHead, self).__init__() + self.drop_rate = drop_rate + # self.global_pool, self.fc = create_classifier(in_chs, num_classes, pool_type=pool_type) + self.global_pool = GlobalAvgPooling() + self.fc = nn.Dense(in_chs, num_classes, has_bias=True) + self.dropout = nn.Dropout(1.0 - float(self.drop_rate)) + + def construct(self, *inputs, **kwargs): + x = inputs[0] + x = self.global_pool(x) + if self.drop_rate: + x = self.dropout(x) + x = self.fc(x) + return x + + +# ConvBnAct +def get_padding(kernel_size: int, stride: int = 1, 
dilation: int = 1, **_) -> int: + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + return padding + + +# Can SAME padding for given args be done statically? +def is_static_pad(kernel_size: int, stride: int = 1, dilation: int = 1, **_): + return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0 + + +def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]: + dynamic = False + if isinstance(padding, str): + # for any string padding, the padding will be calculated for you, one of three ways + padding = padding.lower() + if padding == 'same': + # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact + if is_static_pad(kernel_size, **kwargs): + # static case, no extra overhead + padding = get_padding(kernel_size, **kwargs) + else: + # dynamic 'SAME' padding, has runtime/GPU memory overhead + padding = 0 + dynamic = True + elif padding == 'valid': + # 'VALID' padding, same as padding=0 + padding = 0 + else: + # Default to PyTorch style 'same'-ish symmetric padding + padding = get_padding(kernel_size, **kwargs) + return padding, dynamic + + +def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs): + padding = kwargs.pop('padding', 'valid') + kwargs.setdefault('has_bias', False) + padding, _ = get_padding_value(padding, kernel_size, **kwargs) + if padding != 0: + pad_mode = 'pad' + else: + pad_mode = 'valid' + return nn.Conv2d( + in_chs, out_chs, kernel_size, + padding=padding, pad_mode=pad_mode, **kwargs + ) + + +def create_conv2d(in_channels, out_channels, kernel_size, **kwargs): + """ Select a 2d convolution implementation based on arguments + Creates and returns one of torch.nn.Conv2d, Conv2dSame, MixedConv2d, or CondConv2d. + + Used extensively by EfficientNet, MobileNetv3 and related networks. + """ + depthwise = kwargs.pop('depthwise', False) + groups = out_channels if depthwise else kwargs.pop('group', 1) + + m = create_conv2d_pad(in_channels, out_channels, kernel_size, group=groups, **kwargs) + return m + + +class BatchNormAct2d(nn.BatchNorm2d): + """BatchNorm + Activation + + This module performs BatchNorm + Activation in a manner that will remain backwards + compatible with weights trained with separate bn, act. This is why we inherit from BN + instead of composing it as a .bn member. + """ + + def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True, + apply_act=True, act_layer=nn.ReLU, inplace=True, drop_block=None): + super(BatchNormAct2d, self).__init__( + num_features, eps=eps, momentum=1.0 - momentum, affine=affine) + + if act_layer is not None and apply_act: + # act_args = dict(inplace=True) if inplace else {} + self.act = act_layer() + else: + self.act = None + + def _forward_python(self, x): + return super(BatchNormAct2d, self).construct(x) + + def construct(self, *inputs, **kwargs): + x = inputs[0] + x = self._forward_python(x) + if self.act is not None: + x = self.act(x) + return x + + +def convert_norm_act_type(norm_layer, act_layer, norm_kwargs=None): + assert isinstance(norm_layer, (type, str, types.FunctionType, functools.partial)) + assert act_layer is None or isinstance(act_layer, (type, str, types.FunctionType, functools.partial)) + norm_act_args = norm_kwargs.copy() if norm_kwargs else {} + norm_act_layer = BatchNormAct2d + # Must pass `act_layer` through for backwards compat where `act_layer=None` implies no activation. 
+ # In the future, may force use of `apply_act` with `act_layer` arg bound to relevant NormAct types + # It is intended that functions/partial does not trigger this, they should define act. + norm_act_args.update(dict(act_layer=act_layer)) + return norm_act_layer, norm_act_args + + +class ConvBnAct(nn.Cell): + def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding='', dilation=1, groups=1, + norm_layer=nn.BatchNorm2d, norm_kwargs=None, act_layer=nn.ReLU, apply_act=True, + drop_block=None, aa_layer=None): + super(ConvBnAct, self).__init__() + use_aa = aa_layer is not None + + self.conv = create_conv2d( + in_channels, out_channels, kernel_size, stride=1 if use_aa else stride, + padding=padding, dilation=dilation, group=groups, has_bias=False) + + # NOTE for backwards compatibility with models that use separate norm and act layer definitions + norm_act_layer, norm_act_args = convert_norm_act_type(norm_layer, act_layer, norm_kwargs) + self.bn = norm_act_layer(out_channels, apply_act=apply_act, drop_block=drop_block, **norm_act_args) + self.aa = aa_layer(channels=out_channels) if stride == 2 and use_aa else None + + @property + def in_channels(self): + return self.conv.in_channels + + @property + def out_channels(self): + return self.conv.out_channels + + def construct(self, *inputs, **kwargs): + x = inputs[0] + x = self.conv(x) + x = self.bn(x) + if self.aa is not None: + x = self.aa(x) + return x + + +# DropPath +def drop_path(x, drop_prob: float = 0., training: bool = False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + + random_tensor = keep_prob + UniformReal()(shape) + random_tensor = Floor()(random_tensor) # binarize + output = Div()(x, keep_prob) * random_tensor + return output + + +# SEModule +class DropPath(nn.Cell): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
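+    Kept paths are rescaled by 1 / keep_prob so that the expected value of the
+    activations is unchanged at training time.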
+ """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def construct(self, *inputs, **kwargs): + x = inputs[0] + return drop_path(x, self.drop_prob, self.training) + + +class SEModule(nn.Cell): + + def __init__(self, channels, reduction=16, act_layer=nn.ReLU, min_channels=8, reduction_channels=None, + gate_layer=nn.Sigmoid): + super(SEModule, self).__init__() + reduction_channels = reduction_channels or max(channels // reduction, min_channels) + self.fc1 = nn.Conv2d(channels, reduction_channels, kernel_size=1, has_bias=True, pad_mode='valid') + self.act = act_layer() # inplace=True) + self.fc2 = nn.Conv2d(reduction_channels, channels, kernel_size=1, has_bias=True, pad_mode='valid') + self.gate = gate_layer() + + def construct(self, *inputs, **kwargs): + x = inputs[0] + x_se = x.mean((2, 3), keep_dims=True) + x_se = self.fc1(x_se) + x_se = self.act(x_se) + x_se = self.fc2(x_se) + return x * self.gate(x_se) + + +def quantize_float(f, q): + """Converts a float to closest non-zero int divisible by q.""" + return int(round(f / q) * q) + + +def adjust_widths_groups_comp(widths, bottle_ratios, groups): + """Adjusts the compatibility of widths and groups.""" + bottleneck_widths = [int(w * b) for w, b in zip(widths, bottle_ratios)] + groups = [min(g, w_bot) for g, w_bot in zip(groups, bottleneck_widths)] + bottleneck_widths = [quantize_float(w_bot, g) for w_bot, g in + zip(bottleneck_widths, groups)] + widths = [int(w_bot / b) for w_bot, b in + zip(bottleneck_widths, bottle_ratios)] + return widths, groups + + +def generate_regnet(width_slope, width_initial, width_mult, depth, q=8): + """Generates per block widths from RegNet parameters.""" + assert width_slope >= 0 and width_initial > 0 and width_mult > 1 and width_initial % q == 0 + widths_cont = np.arange(depth) * width_slope + width_initial + width_exps = np.round( + np.log(widths_cont / width_initial) / np.log(width_mult)) + widths = width_initial * np.power(width_mult, width_exps) + widths = np.round(np.divide(widths, q)) * q + num_stages, max_stage = len(np.unique(widths)), width_exps.max() + 1 + widths, widths_cont = widths.astype(int).tolist(), widths_cont.tolist() + return widths, num_stages, max_stage, widths_cont + + +class Bottleneck(nn.Cell): + """ RegNet Bottleneck + + This is almost exactly the same as a ResNet Bottleneck. The main difference is the SE block is moved from + after conv3 to after conv2. Otherwise, it's just redefining the arguments for groups/bottleneck channels. 
+ """ + + def __init__(self, in_chs, out_chs, stride=1, dilation=1, + bottleneck_ratio=1, group_width=1, se_ratio=0.25, + downsample=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, + aa_layer=None, + drop_block=None, drop_path=None): + + super(Bottleneck, self).__init__() + bottleneck_chs = int(round(out_chs * bottleneck_ratio)) + groups = bottleneck_chs // group_width + + cargs = dict(act_layer=act_layer, norm_layer=norm_layer, + aa_layer=aa_layer, drop_block=drop_block) + self.conv1 = ConvBnAct(in_chs, bottleneck_chs, kernel_size=1, **cargs) + self.conv2 = ConvBnAct( + bottleneck_chs, bottleneck_chs, kernel_size=3, stride=stride, + dilation=dilation, + groups=groups, **cargs) + if se_ratio: + se_channels = int(round(in_chs * se_ratio)) + self.se = SEModule(bottleneck_chs, reduction_channels=se_channels) + else: + self.se = None + cargs['act_layer'] = None + self.conv3 = ConvBnAct(bottleneck_chs, out_chs, kernel_size=1, **cargs) + self.act3 = act_layer() + self.downsample = downsample + self.drop_path = drop_path + + def zero_init_last_bn(self): + nn.init.zeros_(self.conv3.bn.weight) + + def construct(self, *inputs, **kwargs): + x = inputs[0] + shortcut = x + x = self.conv1(x) + x = self.conv2(x) + if self.se is not None: + x = self.se(x) + x = self.conv3(x) + if self.drop_path is not None: + x = self.drop_path(x) + if self.downsample is not None: + shortcut = self.downsample(shortcut) + x += shortcut + x = self.act3(x) + return x + + +def downsample_conv( + in_chs, out_chs, kernel_size, stride=1, dilation=1, norm_layer=None): + norm_layer = norm_layer or nn.BatchNorm2d + kernel_size = 1 if stride == 1 and dilation == 1 else kernel_size + dilation = dilation if kernel_size > 1 else 1 + return ConvBnAct( + in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, + norm_layer=norm_layer, act_layer=None) + + +class RegStage(nn.Cell): + """Stage (sequence of blocks w/ the same output shape).""" + + def __init__(self, in_chs, out_chs, stride, dilation, depth, bottle_ratio, + group_width, + block_fn=Bottleneck, se_ratio=0., drop_path_rates=None, + drop_block=None): + super(RegStage, self).__init__() + block_kwargs = {} # FIXME setup to pass various aa, norm, act layer common args + first_dilation = 1 if dilation in (1, 2) else 2 + + list_of_block_fn = [nn.Identity()] + + for i in range(depth): + block_stride = stride if i == 0 else 1 + block_in_chs = in_chs if i == 0 else out_chs + block_dilation = first_dilation if i == 0 else dilation + if drop_path_rates is not None and drop_path_rates[i] > 0.: + drop_path = DropPath(drop_path_rates[i]) + else: + drop_path = None + if (block_in_chs != out_chs) or (block_stride != 1): + proj_block = downsample_conv(block_in_chs, out_chs, 1, + block_stride, block_dilation) + else: + proj_block = None + + list_of_block_fn.append( + block_fn( + block_in_chs, out_chs, block_stride, block_dilation, + bottle_ratio, group_width, se_ratio, + downsample=proj_block, drop_block=drop_block, + drop_path=drop_path, **block_kwargs) + ) + + self.b = nn.CellList(list_of_block_fn) + + def construct(self, *inputs, **kwargs): + x = inputs[0] + for block in self.b: + x = block(x) + return x + + +class RegNet(nn.Cell): + """RegNet model. 
+
+    Paper: https://arxiv.org/abs/2003.13678
+
+    Original Impl: https://github.com/facebookresearch/pycls/blob/master/pycls/models/regnet.py
+    """
+
+    def __init__(self, cfg, in_chans=3, num_classes=1000, output_stride=32,
+                 global_pool='avg', drop_rate=0.,
+                 drop_path_rate=0., zero_init_last_bn=True):
+
+        super().__init__()
+        # TODO add drop block, drop path, anti-aliasing, custom bn/act args
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        assert output_stride in (8, 16, 32)
+
+        # Construct the stem
+        stem_width = cfg['stem_width']
+        self.stem = ConvBnAct(in_chans, stem_width, 3, stride=2)
+        self.feature_info = [
+            dict(num_chs=stem_width, reduction=2, module='stem')
+        ]
+
+        # Construct the stages
+        prev_width = stem_width
+        curr_stride = 2
+        stage_params = self._get_stage_params(cfg, output_stride=output_stride,
+                                              drop_path_rate=drop_path_rate)
+        se_ratio = cfg['se_ratio']
+        list_of_regstages = [nn.Identity()]
+        for i, stage_args in enumerate(stage_params):
+            stage_name = "s{}".format(i + 1)
+            list_of_regstages.append(RegStage(prev_width, **stage_args, se_ratio=se_ratio))
+            prev_width = stage_args['out_chs']
+            curr_stride *= stage_args['stride']
+            self.feature_info += [
+                dict(num_chs=prev_width, reduction=curr_stride,
+                     module=stage_name)]
+        self.s = nn.CellList(list_of_regstages)
+        # Construct the head
+        self.num_features = prev_width
+        self.head = ClassifierHead(
+            in_chs=prev_width, num_classes=num_classes, pool_type=global_pool,
+            drop_rate=drop_rate)
+
+    def _get_stage_params(self, cfg, default_stride=2, output_stride=32,
+                          drop_path_rate=0.):
+        # Generate RegNet ws per block
+        w_a, w_0, w_m, d = cfg['wa'], cfg['w0'], cfg['wm'], cfg['depth']
+        widths, num_stages, _, _ = generate_regnet(w_a, w_0, w_m, d)
+
+        # Convert to per stage format
+        stage_widths, stage_depths = np.unique(widths, return_counts=True)
+
+        # Use the same group width, bottleneck mult and stride for each stage
+        stage_groups = [cfg['group_w'] for _ in range(num_stages)]
+        stage_bottle_ratios = [cfg['bottle_ratio'] for _ in range(num_stages)]
+        stage_strides = []
+        stage_dilations = []
+        net_stride = 2
+        dilation = 1
+        for _ in range(num_stages):
+            if net_stride >= output_stride:
+                dilation *= default_stride
+                stride = 1
+            else:
+                stride = default_stride
+                net_stride *= stride
+            stage_strides.append(stride)
+            stage_dilations.append(dilation)
+        stage_dpr = np.split(np.linspace(0, drop_path_rate, d),
+                             np.cumsum(stage_depths[:-1]))
+
+        # Adjust the compatibility of ws and gws
+        stage_widths, stage_groups = adjust_widths_groups_comp(stage_widths,
+                                                               stage_bottle_ratios,
+                                                               stage_groups)
+        param_names = ['out_chs', 'stride', 'dilation', 'depth',
+                       'bottle_ratio', 'group_width', 'drop_path_rates']
+        stage_params = [
+            dict(zip(param_names, params)) for params in
+            zip(stage_widths, stage_strides, stage_dilations, stage_depths,
+                stage_bottle_ratios, stage_groups,
+                stage_dpr)]
+        return stage_params
+
+    def get_classifier(self):
+        return self.head.fc
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.head = ClassifierHead(self.num_features, num_classes,
+                                   pool_type=global_pool,
+                                   drop_rate=self.drop_rate)
+
+    def forward_features(self, x):
+        # same as construct() without the classifier head; MindSpore cells do
+        # not provide children(), so the stem and stages are iterated explicitly
+        x = self.stem(x)
+        for block in self.s:
+            x = block(x)
+        return x
+
+    def construct(self, *inputs, **kwargs):
+        x = inputs[0]
+
+        x = self.stem(x)
+        for block in self.s:
+            x = block(x)
+        x = self.head(x)
+        return x
+
+
+def build_model_with_cfg(
+        model_cls: Callable,
+        default_cfg: dict,
+        model_cfg: dict = None,
+        **kwargs):
+    model = model_cls(**kwargs) if model_cfg is None else model_cls(
+        cfg=model_cfg, **kwargs)
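+    # keep a copy of the default config (url, input size, classifier name) on
+    # the model, mirroring the timm convention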
+    model.default_cfg = deepcopy(default_cfg)
+    return model
+
+
+def _create_regnet(variant, **kwargs):
+    return build_model_with_cfg(
+        RegNet, default_cfg=_cfg(),
+        model_cfg=model_cfgs[variant], **kwargs
+    )
+
+
+def regnety_160(**kwargs):
+    """RegNetY-16GF"""
+    return _create_regnet('regnety_160', **kwargs)
+
+# if __name__ == '__main__':
+#     import numpy as np
+#     import mindspore
+#     from mindspore import Tensor
+#     from mindspore import context
+#
+#     context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU")
+#     # context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
+#
+#     model = regnety_160()
+#     print(model)
+#     dummy_input = Tensor(np.random.rand(4, 3, 224, 224), dtype=mindspore.float32)
+#     y = model(dummy_input)
+#     print(y.shape)
diff --git a/mindcv/loss/loss_factory.py b/mindcv/loss/loss_factory.py
index 54659ec95b9a8d7fdd9325f9c5e4a0c7647e0a97..9284b49216b45584b8b37ff0f1c68e24a6964f08 100644
--- a/mindcv/loss/loss_factory.py
+++ b/mindcv/loss/loss_factory.py
@@ -7,16 +7,26 @@ from .asymmetric import AsymmetricLossMultilabel, AsymmetricLossSingleLabel
 from .binary_cross_entropy_smooth import BinaryCrossEntropySmooth
 from .cross_entropy_smooth import CrossEntropySmooth
 from .jsd import JSDCrossEntropy
+from .distillation.criterion import get_criterion
 
 __all__ = ["create_loss"]
 
 
 def create_loss(
     name: str = "CE",
     weight: Optional[Tensor] = None,
     reduction: str = "mean",
     label_smoothing: float = 0.0,
     aux_factor: float = 0.0,
+    num_classes: int = 1000,
+    mixup: float = 0.8,
+    cutmix: float = 1.0,
+    bce_loss: bool = False,
+    distillation_type: str = 'hard',
+    teacher_path: str = '',
+    teacher_model: str = 'regnety_160',
+    distillation_alpha: float = 0.5,
+    distillation_tau: float = 1.0,
 ):
 
@@ -31,6 +41,15 @@ def create_loss(
         from overfitting when calculating Loss. The value range is [0.0, 1.0]. Default: 0.0.
         aux_factor (float): Auxiliary loss factor. Set aux_factor > 0.0 if the model has auxiliary logit outputs (i.e., deep supervision), like inception_v3. Default: 0.0.
+        num_classes (int): Number of label classes, used by the distillation criterion. Default: 1000.
+        mixup (float): MixUp alpha; a value > 0 selects the soft-target base criterion. Default: 0.8.
+        cutmix (float): CutMix alpha; a value > 0 selects the soft-target base criterion. Default: 1.0.
+        bce_loss (bool): If True, use BCEWithLogitsLoss as the base criterion. Default: False.
+        distillation_type (str): One of 'none', 'soft' or 'hard'. Default: 'hard'.
+        teacher_path (str): Path to the teacher checkpoint. Default: ''.
+        teacher_model (str): Name of the teacher model. Default: 'regnety_160'.
+        distillation_alpha (float): Weight of the distillation term. Default: 0.5.
+        distillation_tau (float): Temperature for soft distillation. Default: 1.0.
 
     Inputs:
         - logits (Tensor or Tuple of Tensor): Input logits.
             Shape [N, C], where N means the number of samples,
@@ -58,6 +77,18 @@ def create_loss(
         loss = AsymmetricLossMultilabel()
     elif name == "jsd":
         loss = JSDCrossEntropy(smoothing=label_smoothing, aux_factor=aux_factor, reduction=reduction, weight=weight)
+    elif name == "distillation_for_levit":
+        loss = get_criterion(
+            smoothing=label_smoothing,  # 0.1; reuses the label_smoothing argument
+            num_classes=num_classes,  # 1000
+            mixup=mixup,  # 0.8
+            cutmix=cutmix,  # 1.0
+            bce_loss=bce_loss,  # False
+            distillation_type=distillation_type,  # hard
+            teacher_path=teacher_path,  # teacher checkpoint path
+            teacher_model=teacher_model,  # regnety_160
+            distillation_alpha=distillation_alpha,  # 0.5
+            distillation_tau=distillation_tau)
     else:
         raise NotImplementedError
diff --git a/mindcv/models/__init__.py b/mindcv/models/__init__.py
index 6f28ba6f149c208d0d4a8755862443d2f6660496..5eda95bf45809e2e11fa312affc3921a2b9abe82 100644
--- a/mindcv/models/__init__.py
+++ b/mindcv/models/__init__.py
@@ -54,6 +54,7 @@ from . import (
     volo,
     xception,
     xcit,
+    levit,
 )
 from .bit import *
 from .cait import *
@@ -110,6 +111,7 @@ from .vit import *
 from .volo import *
 from .xception import *
 from .xcit import *
+from .levit import *
 
 # some net module is replaced by the net function with the same name when we do from .net import *
 # we cannot use net.__all__, so we manually copy net.__all__ here.
@@ -168,3 +170,4 @@ __all__.extend(vit.__all__)
 __all__.extend(volo.__all__)
 __all__.extend(["Xception", "xception"])
 __all__.extend(xcit.__all__)
+__all__.extend(levit.__all__)
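For reference, a minimal sketch of how the new branch is reached (argument values mirror `levit_128s_new_distillation.yaml`; the checkpoint path is illustrative, and this assumes `mindcv.loss` re-exports `create_loss` as in the existing package):

```python
from mindcv.loss import create_loss

# name must match the --name_for_distillation default added in config.py
criterion = create_loss(
    name="distillation_for_levit",
    label_smoothing=0.1,
    num_classes=1000,
    mixup=0.8,                # > 0, so the soft-target base criterion is selected
    cutmix=1.0,
    distillation_type="hard",
    teacher_path="/cache/code/levit_new/regnety_160.ckpt",  # illustrative path
    teacher_model="regnety_160",
    distillation_alpha=0.5,
    distillation_tau=1.0,
)
```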
diff --git a/mindcv/models/levit.py b/mindcv/models/levit.py
new file mode 100644
index 0000000000000000000000000000000000000000..0edd93ea9afce7c7603c4686c5c2aafa5731f099
--- /dev/null
+++ b/mindcv/models/levit.py
@@ -0,0 +1,699 @@
+"""
+MindSpore implementation of `LeViT`.
+Refer to: "LeViT: a Vision Transformer in ConvNet's Clothing for Faster Inference"
+(https://arxiv.org/abs/2104.01136)
+"""
+import itertools
+import numpy as np
+
+import mindspore as ms
+import mindspore.nn as nn
+import mindspore.ops as ops
+import mindspore.common.initializer as init
+from mindspore.common.initializer import initializer, TruncatedNormal
+
+from mindspore import Parameter, Tensor
+from mindspore import load_checkpoint, load_param_into_net
+
+from .helpers import load_pretrained
+from .registry import register_model
+
+# IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
+# IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
+
+__all__ = [
+    "LeViT",
+    "LeViT_128S",
+    "LeViT_128",
+    "LeViT_192",
+    "LeViT_256",
+    "LeViT_384",
+]
+
+
+def mindspore_params(network):
+    ms_params = {}
+    for param in network.get_parameters():
+        name = param.name
+        value = param.data.asnumpy()
+        print(name, value.shape)
+        ms_params[name] = value
+    return ms_params
+
+
+def _cfg(url='', **kwargs):  # TODO: verify these fields once pretrained checkpoints are released
+    return {
+        'url': url,
+        'num_classes': 1000,
+        'input_size': (3, 224, 224),
+        # 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'patch_embed.proj', 'classifier': 'head',
+        **kwargs
+    }
+
+
+default_cfgs = {
+    'LeViT_128S': _cfg(url=''),
+    'LeViT_128': _cfg(url=''),
+    'LeViT_192': _cfg(url=''),
+    'LeViT_256': _cfg(url=''),
+    'LeViT_384': _cfg(url='')
+}
+
+FLOPS_COUNTER = 0
+
+
+# Building blocks that combine convolution / dense layers with batch norm,
+# plus the residual wrapper used throughout the network.
+class Conv2d_BN(nn.SequentialCell):
+    def __init__(self,
+                 a: int,
+                 b: int,
+                 ks: int = 1,
+                 stride: int = 1,
+                 pad: int = 0,
+                 dilation: int = 1,
+                 group: int = 1,
+                 resolution: int = -10000) -> None:
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels=a,
+                              out_channels=b,
+                              kernel_size=ks,
+                              stride=stride,
+                              padding=pad,
+                              dilation=dilation,
+                              group=group,
+                              has_bias=False,
+                              pad_mode="pad")
+
+        # MindSpore's BN momentum is 1 - PyTorch's momentum (0.9 here vs 0.1 there)
+        self.bn = nn.BatchNorm2d(num_features=b,
+                                 gamma_init="ones",
+                                 beta_init="zeros",
+                                 use_batch_statistics=True,
+                                 momentum=0.9)
+
+        # global FLOPS_COUNTER
+        # output_points = ((resolution + 2 * pad - dilation *
+        #                   (ks - 1) - 1) // stride + 1) ** 2
+        # FLOPS_COUNTER += a * b * output_points * (ks ** 2) // group
+
+    def construct(self, x: Tensor) -> Tensor:
+        x = self.conv(x)
+        x = self.bn(x)
+        return x
+
+
+class Linear_BN(nn.SequentialCell):
+    def __init__(self,
+                 a: int,
+                 b: int,
+                 resolution: int = -100000) -> None:
+        super().__init__()
+        self.linear = nn.Dense(a,
+                               b,
+                               weight_init='Uniform',
+                               bias_init='Uniform',
+                               has_bias=False)
+
+        self.bn1d = nn.BatchNorm1d(num_features=b,
+                                   gamma_init="ones",
+                                   beta_init="zeros",
+                                   use_batch_statistics=True,
+                                   momentum=0.9)
+
+        # global FLOPS_COUNTER
+        # output_points = resolution ** 2
+        # FLOPS_COUNTER += a * b * output_points
+
+    def construct(self, x: Tensor) -> Tensor:
+        x = self.linear(x)
+        # flatten (B, N, C) tokens to (B*N, C) so BatchNorm1d normalizes per channel
+        x1, x2, x3 = x.shape
+        new_x = ops.reshape(x, (x1 * x2, x3))
+        x = self.bn1d(new_x).reshape(x.shape)
+        return x
+
+
+class BN_Linear(nn.SequentialCell):
+    def __init__(self,
+                 a: int,
+                 b: int,
+                 bias: bool = True,
+                 std: float = 0.02) -> None:
+        super().__init__()
+        self.bn1d = nn.BatchNorm1d(num_features=a,
+                                   gamma_init="ones",
+                                   beta_init="zeros",
+                                   use_batch_statistics=True,
+                                   momentum=0.9)
+
+        self.linear = nn.Dense(a,
+                               b,
+                               weight_init=init.TruncatedNormal(sigma=std),
+                               bias_init='zeros',
+                               has_bias=bias)
+
+        # global FLOPS_COUNTER
+        # FLOPS_COUNTER += a * b
+
+    def construct(self, x: Tensor) -> Tensor:
+        x = self.bn1d(x)
+        x = self.linear(x)
+        return x
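+
+# Linear_BN applies BatchNorm over the flattened (B*N, C) tokens after the
+# projection, while BN_Linear (used for the classifier heads) normalizes the
+# input features before the final Dense layer.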
+
+
+class Residual(nn.Cell):
+    def __init__(self,
+                 m: nn.Cell = None,
+                 drop: float = 0.):
+        super().__init__()
+        self.m = m
+        self.drop = drop
+
+    def construct(self, x: Tensor) -> Tensor:
+        if self.training and self.drop > 0:
+            # stochastic depth: randomly drop the residual branch per sample
+            # (note: host-side numpy randomness, kept from the original code)
+            mask = Tensor(
+                (np.random.rand(x.shape[0], 1, 1) > self.drop) / (1 - self.drop),
+                ms.float32)
+            return x + self.m(x) * mask
+        y = self.m(x)
+        x = x + y
+        return x
+
+
+# def b16(n, activation=nn.HSwish, resolution=224):  # reference CNN patch-embedding factory
+#     return nn.SequentialCell(
+#         Conv2d_BN(3, n // 8, 3, 2, 1, resolution=resolution),
+#         activation(),
+#         Conv2d_BN(n // 8, n // 4, 3, 2, 1, resolution=resolution // 2),
+#         activation(),
+#         Conv2d_BN(n // 4, n // 2, 3, 2, 1, resolution=resolution // 4),
+#         activation(),
+#         Conv2d_BN(n // 2, n, 3, 2, 1, resolution=resolution // 8))
+
+
+class MLP(nn.Cell):  # CNN patch embedding: four stride-2 Conv2d_BN stages
+    def __init__(self,
+                 n: int,
+                 resolution: int = 224) -> None:
+        super().__init__()
+
+        self.act = nn.HSwish()
+        self.cb1 = Conv2d_BN(3, n // 8, 3, 2, 1, resolution=resolution)
+        self.cb2 = Conv2d_BN(n // 8, n // 4, 3, 2, 1, resolution=resolution // 2)
+        self.cb3 = Conv2d_BN(n // 4, n // 2, 3, 2, 1, resolution=resolution // 4)
+        self.cb4 = Conv2d_BN(n // 2, n, 3, 2, 1, resolution=resolution // 8)
+
+    def construct(self, x: Tensor) -> Tensor:
+        x = self.cb1(x)
+        x = self.act(x)
+        x = self.cb2(x)
+        x = self.act(x)
+        x = self.cb3(x)
+        x = self.act(x)
+        x = self.cb4(x)
+        return x
+
+
+class Subsample(nn.Cell):  # token downsampling by strided slicing of the spatial grid
+    def __init__(self,
+                 stride: int,
+                 resolution: int):
+        super().__init__()
+        self.stride = stride
+        self.resolution = resolution
+
+    def construct(self, x: Tensor) -> Tensor:
+        B, N, C = x.shape
+        x = x.view(B, self.resolution, self.resolution, C)[
+            :, ::self.stride, ::self.stride].reshape(B, -1, C)
+        return x
+
+
+class Attention(nn.Cell):  # multi-head attention with learned relative position biases
+    def __init__(self,
+                 dim: int,
+                 key_dim: int,
+                 num_heads: int = 8,
+                 attn_ratio: int = 4,
+                 activation: type = None,
+                 resolution: int = 14) -> None:
+
+        super().__init__()
+
+        self.num_heads = num_heads
+        self.scale = key_dim ** -0.5
+        self.key_dim = key_dim
+        self.nh_kd = nh_kd = key_dim * num_heads
+        self.d = int(attn_ratio * key_dim)
+        self.dh = int(attn_ratio * key_dim) * num_heads
+        self.attn_ratio = attn_ratio
+        h = self.dh + nh_kd * 2
+        self.qkv = Linear_BN(dim, h, resolution=resolution)
+        self.proj = nn.SequentialCell(activation(), Linear_BN(self.dh, dim, resolution=resolution))
+
+        # enumerate all (row, col) positions of the token grid
+        points = list(itertools.product(range(resolution), range(resolution)))
+        self.N = len(points)
+
+        attention_offsets = {}
+        idxs = []
+        for p1 in points:
+            for p2 in points:
+                offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1]))
+                if offset not in attention_offsets:
+                    attention_offsets[offset] = len(attention_offsets)
+                idxs.append(attention_offsets[offset])
+
+        self.attention_biases = ms.Parameter(
+            Tensor(np.zeros([num_heads, len(attention_offsets)], np.float32)))
+
+        attention_bias_idxs = ms.Tensor(idxs, dtype=ms.int64).view(self.N, self.N)
+        self.attention_bias_idxs = ms.Parameter(attention_bias_idxs, requires_grad=False)
+
+        # cached bias table for inference; training indexes the live parameter
+        self.ab = self.attention_biases[:, self.attention_bias_idxs]
+
+        self.softmax = nn.Softmax(axis=-1)
+
+        # global FLOPS_COUNTER
+        # # queries * keys
+        # FLOPS_COUNTER += num_heads * (resolution ** 4) * key_dim
+        # # softmax
+        # FLOPS_COUNTER += num_heads * (resolution ** 4)
+        # # attention * v
+        # FLOPS_COUNTER += num_heads * self.d * (resolution ** 4)
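+
+    # For intuition: with resolution=2 the loops above visit 16 (query, key)
+    # pairs but only 4 distinct absolute offsets, so attention_biases stores 4
+    # learnable values per head and attention_bias_idxs expands them into the
+    # full 4x4 bias matrix shared by all position pairs with the same offset.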
+
+    def construct(self,
+                  x: Tensor) -> Tensor:  # x: (B, N, C)
+        B, N, C = x.shape
+        atte = self.qkv(x).view(B, N, self.num_heads, -1)
+        # Split the fused projection into q (key_dim), k (key_dim) and v (d) channels.
+        qkv = ms.numpy.split(atte,
+                             [self.key_dim,
+                              self.key_dim + self.key_dim],
+                             axis=3)
+        q = qkv[0]
+        k = qkv[1]
+        v = qkv[2]
+
+        q = ops.transpose(q, (0, 2, 1, 3))
+        k = ops.transpose(k, (0, 2, 1, 3))
+        v = ops.transpose(v, (0, 2, 1, 3))
+
+        attn = (
+            ops.matmul(q, ops.transpose(k, (0, 1, 3, 2))) * self.scale
+            +
+            (self.attention_biases[:, self.attention_bias_idxs]
+             if self.training else self.ab)
+        )
+
+        attn = self.softmax(attn)
+
+        x = ops.transpose(ops.matmul(attn, v), (0, 2, 1, 3))
+
+        x = x.reshape(B, N, self.dh)
+
+        x = self.proj(x)
+
+        return x
+
+
+# AttentionSubsample: a downsampling layer built on the attention mechanism.
+class AttentionSubsample(nn.Cell):
+    def __init__(self,
+                 in_dim: int,
+                 out_dim: int,
+                 key_dim: int,
+                 num_heads: int = 8,
+                 attn_ratio: int = 2,
+                 activation: type = None,
+                 stride: int = 2,
+                 resolution: int = 14,
+                 resolution_: int = 7) -> None:
+        super().__init__()
+
+        self.num_heads = num_heads
+        self.scale = key_dim ** -0.5
+        self.key_dim = key_dim
+        self.nh_kd = nh_kd = key_dim * num_heads
+        self.d = int(attn_ratio * key_dim)
+        self.dh = int(attn_ratio * key_dim) * self.num_heads
+        self.attn_ratio = attn_ratio
+        self.resolution_ = resolution_
+        self.resolution_2 = resolution_ ** 2
+        h = self.dh + nh_kd
+        self.kv = Linear_BN(in_dim, h, resolution=resolution)
+
+        self.q = nn.SequentialCell(
+            Subsample(stride, resolution),
+            Linear_BN(in_dim, nh_kd, resolution=resolution_))
+        self.proj = nn.SequentialCell(activation(), Linear_BN(self.dh, out_dim, resolution=resolution_))
+        self.stride = stride
+        self.resolution = resolution
+        points = list(itertools.product(range(resolution), range(resolution)))
+        points_ = list(itertools.product(range(resolution_), range(resolution_)))
+
+        N = len(points)
+        N_ = len(points_)
+        attention_offsets = {}
+        idxs = []
+        for p1 in points_:
+            for p2 in points:
+                size = 1
+                offset = (
+                    abs(p1[0] * stride - p2[0] + (size - 1) / 2),
+                    abs(p1[1] * stride - p2[1] + (size - 1) / 2))
+                if offset not in attention_offsets:
+                    attention_offsets[offset] = len(attention_offsets)
+                idxs.append(attention_offsets[offset])
+
+        self.attention_biases = Parameter(
+            Tensor(np.zeros([num_heads, len(attention_offsets)], np.float32)))
+
+        attention_bias_idxs = (ms.Tensor(idxs, dtype=ms.int64)).view((N_, N))
+        self.attention_bias_idxs = ms.Parameter(attention_bias_idxs, requires_grad=False)
+
+        self.ab = self.attention_biases[:, self.attention_bias_idxs]
+
+        self.softmax = nn.Softmax(axis=-1)
+
+        # FLOPs accounting from the reference implementation, kept for documentation:
+        # global FLOPS_COUNTER
+        # # queries * keys
+        # FLOPS_COUNTER += num_heads * (resolution ** 2) * (resolution_ ** 2) * key_dim
+        # # softmax
+        # FLOPS_COUNTER += num_heads * (resolution ** 2) * (resolution_ ** 2)
+        # # attention * v
+        # FLOPS_COUNTER += num_heads * (resolution ** 2) * (resolution_ ** 2) * self.d
+
+    def construct(self,
+                  x: Tensor) -> Tensor:
+
+        B, N, C = x.shape
+        atte = self.kv(x).view(B, N, self.num_heads, -1)
+
+        # Split the fused projection into k (key_dim) and v (d) channels.
+        kv = ms.numpy.split(atte, [self.key_dim], axis=3)
+        k = kv[0]
+        v = kv[1]
+        v = ops.transpose(v, (0, 2, 1, 3))
+        k = ops.transpose(k, (0, 2, 1, 3))
+
+        q = self.q(x).view(B, self.resolution_2, self.num_heads, self.key_dim)
+        q = ops.transpose(q, (0, 2, 1, 3))
+
+        attn = (
+            ops.matmul(q, ops.transpose(k, (0, 1, 3, 2))) * self.scale
+            +
+            (self.attention_biases[:, self.attention_bias_idxs]
+             if self.training else self.ab)
+        )
+
+        attn = self.softmax(attn)
+
+        x = ops.transpose(ops.matmul(attn, v), (0, 2, 1, 3))
+        x = x.reshape(B, -1, self.dh)
+        x = self.proj(x)
+        return x
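+
+
+# Shape walk-through for AttentionSubsample (illustrative, assuming the first
+# LeViT_128S downsample: in_dim=128, key_dim=16, num_heads=8, attn_ratio=4,
+# stride=2, resolution=14):
+#   input x:  (B, 196, 128)          # 14*14 tokens
+#   kv(x):    (B, 196, 8, 16 + 64)   # per head: k=key_dim, v=attn_ratio*key_dim
+#   q(x):     (B, 49, 8, 16)         # queries come from the subsampled 7*7 grid
+#   attn:     (B, 8, 49, 196)        # each output token attends to all input tokens
+#   output:   (B, 49, 256)           # projected to out_dim by self.proj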
+
+
+class LeViT(nn.Cell):
+    """Vision Transformer with support for patch or hybrid CNN input stage."""
+
+    def __init__(self,
+                 img_size: int = 224,
+                 patch_size: int = 16,
+                 in_channels: int = 3,
+                 num_classes: int = 1000,
+                 embed_dim: list = [128, 256, 384],
+                 key_dim: list = [16, 16, 16],
+                 depth: list = [2, 3, 4],
+                 num_heads: list = [4, 6, 8],
+                 attn_ratio: list = [2, 2, 2],
+                 mlp_ratio: list = [2, 2, 2],
+                 mlp_n: int = 128,
+                 down_ops: list = [['Subsample', 16, 128 // 16, 4, 2, 2], ['Subsample', 16, 256 // 16, 4, 2, 2]],
+                 attention_activation: type = nn.HSwish,
+                 mlp_activation: type = nn.HSwish,
+                 distillation: bool = False,
+                 drop_path: float = 0):
+        super().__init__()
+
+        self.num_classes = num_classes
+        self.num_features = embed_dim[-1]
+        self.embed_dim = embed_dim
+        self.distillation = distillation
+        self.patch_embed = MLP(mlp_n)
+        self.blocks = []
+
+        # Each down_ops entry reads:
+        # ['Subsample', key_dim, num_heads, attn_ratio, mlp_ratio, stride].
+        down_ops.append([''])
+        resolution = img_size // patch_size
+        for i, (ed, kd, dpth, nh, ar, mr, do) in enumerate(
+                zip(embed_dim, key_dim, depth, num_heads, attn_ratio, mlp_ratio, down_ops)):
+            for _ in range(dpth):
+                self.blocks.append(
+                    Residual(Attention(
+                        ed, kd, nh,
+                        attn_ratio=ar,
+                        activation=attention_activation,
+                        resolution=resolution,
+                    ), drop_path))
+                if mr > 0:
+                    h = int(ed * mr)
+                    self.blocks.append(
+                        Residual(nn.SequentialCell(
+                            Linear_BN(ed, h, resolution=resolution),
+                            mlp_activation(),
+                            Linear_BN(h, ed,  # bn_weight_init=0,
+                                      resolution=resolution),
+                        ), drop_path))
+
+            if do[0] == 'Subsample':
+                resolution_ = (resolution - 1) // do[5] + 1
+                self.blocks.append(
+                    AttentionSubsample(
+                        *embed_dim[i:i + 2], key_dim=do[1], num_heads=do[2],
+                        attn_ratio=do[3],
+                        activation=attention_activation,
+                        stride=do[5],
+                        resolution=resolution,
+                        resolution_=resolution_))
+                resolution = resolution_
+                if do[4] > 0:  # mlp_ratio
+                    h = int(embed_dim[i + 1] * do[4])
+                    self.blocks.append(
+                        Residual(nn.SequentialCell(
+                            Linear_BN(embed_dim[i + 1], h,
+                                      resolution=resolution),
+                            mlp_activation(),
+                            Linear_BN(
+                                h, embed_dim[i + 1],  # bn_weight_init=0,
+                                resolution=resolution),
+                        ), drop_path))
+        self.blocks = nn.SequentialCell(*self.blocks)
+
+        # Classifier head (plus a distillation head when requested).
+        self.head = BN_Linear(
+            embed_dim[-1], num_classes) if num_classes > 0 else nn.Identity()
+        if distillation:
+            self.head_dist = BN_Linear(
+                embed_dim[-1], num_classes) if num_classes > 0 else nn.Identity()
+
+    # Weight initialization from the reference implementation, currently disabled:
+    # def _initialize_weights(self) -> None:
+    #     for _, cell in self.cells_and_names():
+    #         if isinstance(cell, nn.Dense):
+    #             cell.weight.set_data(init.initializer(init.TruncatedNormal(sigma=.02), cell.weight.data.shape))
+    #             if cell.bias is not None:
+    #                 cell.bias.set_data(init.initializer(init.Constant(0), cell.bias.shape))
+    #         elif isinstance(cell, nn.LayerNorm):
+    #             cell.gamma.set_data(init.initializer(init.Constant(1.0), cell.gamma.shape))
+    #             cell.beta.set_data(init.initializer(init.Constant(0), cell.beta.shape))
+
+    def construct(self, x: Tensor) -> Tensor:
+
+        x = self.patch_embed(x)
+        B, C, H, W = x.shape
+        x = x.reshape(B, C, H * W)
+
+        # (B, C, N) -> (B, N, C): tokens along axis 1 for the attention blocks.
+        x = ops.transpose(x, (0, 2, 1))
+
+        x = self.blocks(x)
+
+        # Global average pooling over tokens.
+        x = x.mean(1)
+        if self.distillation:
+            x = self.head(x), self.head_dist(x)
+            if not self.training:
+                # At eval time, average the classifier and distillation logits.
+                x = (x[0] + x[1]) / 2
+        else:
+            x = self.head(x)
+        return x
+
+
+@register_model
+def LeViT_128S(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> LeViT:
+    default_cfg = default_cfgs['LeViT_128S']
+    model = LeViT(in_channels=in_channels, num_classes=num_classes,
+                  embed_dim=[128, 256, 384],
+                  num_heads=[4, 6, 8],
+                  key_dim=[16, 16, 16],
+                  depth=[2, 3, 4],
+                  down_ops=[
+                      ['Subsample', 16, 128 // 16, 4, 2, 2],
+                      ['Subsample', 16, 256 // 16, 4, 2, 2],
+                  ],
+                  mlp_n=128,
+                  distillation=True,
+                  **kwargs)
+    model.default_cfg = default_cfg
+
+    if pretrained:
+        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
+
+    return model
+
+
+@register_model
+def LeViT_128(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> LeViT:
+    default_cfg = default_cfgs['LeViT_128']
+    model = LeViT(in_channels=in_channels, num_classes=num_classes,
+                  embed_dim=[128, 256, 384],
+                  num_heads=[4, 8, 12],
+                  key_dim=[16, 16, 16],
+                  depth=[4, 4, 4],
+                  down_ops=[
+                      ['Subsample', 16, 128 // 16, 4, 2, 2],
+                      ['Subsample', 16, 256 // 16, 4, 2, 2],
+                  ],
+                  mlp_n=128,
+                  distillation=False,
+                  **kwargs)
+    model.default_cfg = default_cfg
+
+    if pretrained:
+        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
+
+    return model
+
+
+@register_model
+def LeViT_192(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> LeViT:
+    default_cfg = default_cfgs['LeViT_192']
+    model = LeViT(in_channels=in_channels, num_classes=num_classes,
+                  embed_dim=[192, 288, 384],
+                  num_heads=[3, 5, 6],
+                  key_dim=[32, 32, 32],
+                  depth=[4, 4, 4],
+                  down_ops=[
+                      ['Subsample', 32, 192 // 32, 4, 2, 2],
+                      ['Subsample', 32, 288 // 32, 4, 2, 2],
+                  ],
+                  mlp_n=192,
+                  distillation=False,
+                  **kwargs)
+    model.default_cfg = default_cfg
+
+    if pretrained:
+        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
+
+    return model
+
+
+@register_model
+def LeViT_256(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> LeViT:
+    default_cfg = default_cfgs['LeViT_256']
+    model = LeViT(in_channels=in_channels, num_classes=num_classes,
+                  embed_dim=[256, 384, 512],
+                  num_heads=[4, 6, 8],
+                  key_dim=[32, 32, 32],
+                  depth=[4, 4, 4],
+                  down_ops=[
+                      ['Subsample', 32, 256 // 32, 4, 2, 2],
+                      ['Subsample', 32, 384 // 32, 4, 2, 2],
+                  ],
+                  mlp_n=256,
+                  distillation=False,
+                  **kwargs)
+    model.default_cfg = default_cfg
+
+    if pretrained:
+        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
+
+    return model
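+
+
+# Minimal usage sketch for the dual-head behavior above (illustrative only;
+# assumes a CPU/PYNATIVE context has been configured, mirroring the smoke
+# test kept at the bottom of this file):
+#
+#     net = LeViT_128S()                   # distillation=True -> two heads
+#     net.set_train(True)
+#     logits, logits_dist = net(images)    # training: a (head, head_dist) tuple
+#     net.set_train(False)
+#     logits = net(images)                 # eval: averaged logits, (B, num_classes)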
+
+
+@register_model
+def LeViT_384(pretrained: bool = False, num_classes: int = 1000, in_channels=3, **kwargs) -> LeViT:
+    default_cfg = default_cfgs['LeViT_384']
+    model = LeViT(in_channels=in_channels, num_classes=num_classes,
+                  embed_dim=[384, 512, 768],
+                  num_heads=[6, 9, 12],
+                  key_dim=[32, 32, 32],
+                  depth=[4, 4, 4],
+                  down_ops=[
+                      ['Subsample', 32, 384 // 32, 4, 2, 2],
+                      ['Subsample', 32, 512 // 32, 4, 2, 2],
+                  ],
+                  mlp_n=384,
+                  distillation=False,
+                  **kwargs)
+    model.default_cfg = default_cfg
+
+    if pretrained:
+        load_pretrained(model, default_cfg, num_classes=num_classes, in_channels=in_channels)
+
+    return model
+
+
+# Standalone smoke test, kept for reference:
+# if __name__ == '__main__':
+#     import numpy as np
+#     import mindspore
+#     from mindspore import Tensor
+#     from mindspore import context
+#
+#     context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU")
+#
+#     model = LeViT_128S()
+#     print(model)
+#     dummy_input = Tensor(np.random.rand(4, 3, 224, 224), dtype=mindspore.float32)
+#     y = model(dummy_input)
+#     print(y.shape)
diff --git a/mindcv/utils/__init__.py b/mindcv/utils/__init__.py
index 39b346e0481a2d82ede9049cde85edae19de6848..d2462bf3cf8e41fbb0be9c2db922d134e3c4ec1f 100644
--- a/mindcv/utils/__init__.py
+++ b/mindcv/utils/__init__.py
@@ -8,4 +8,4 @@ from .path import *
 from .random import *
 from .reduce_manager import *
 from .train_step import *
-from .trainer_factory import *
+from .trainer_factory_distillation import *
diff --git a/mindcv/utils/get_train_one_step.py b/mindcv/utils/get_train_one_step.py
new file mode 100644
index 0000000000000000000000000000000000000000..795889876dcb523203e1c7a60d9a448cc4182154
--- /dev/null
+++ b/mindcv/utils/get_train_one_step.py
@@ -0,0 +1,223 @@
+import mindspore as ms
+from mindspore import nn
+from mindspore import Tensor
+from mindspore import dtype as mstype
+from mindspore.common import RowTensor
+from mindspore.ops import composite as C
+from mindspore.ops import functional as F
+from mindspore.ops import operations as P
+
+"""TrainOneStepWithEMA"""
+
+_ema_op = C.MultitypeFuncGraph("grad_ema_op")
+assign = P.Assign()
+assign_add = P.AssignAdd()
+
+
+@_ema_op.register("Tensor", "Tensor", "Tensor")
+def _ema_weights(factor, ema_weight, weight):
+    """Update one shadow weight in place: ema = ema * factor + weight * (1 - factor)."""
+    return assign(ema_weight, ema_weight * factor + weight * (1 - factor))
+
+
+class EMACell(nn.Cell):
+    """Maintains an exponential moving average of the given weights."""
+
+    def __init__(self, weights, ema_decay=0.9999):
+        super(EMACell, self).__init__()
+        self.ema_weights = weights.clone(prefix="_ema_weights")
+        self.ema_decay = Tensor(ema_decay, mstype.float32)
+        self.hyper_map = C.HyperMap()
+
+    def construct(self, *inputs, **kwargs):
+        weights = inputs[0]
+        success = self.hyper_map(
+            F.partial(_ema_op, self.ema_decay), self.ema_weights, weights
+        )
+        return success
+
+
+_grad_scale = C.MultitypeFuncGraph("grad_scale")
+reciprocal = P.Reciprocal()
+
+
+@_grad_scale.register("Tensor", "Tensor")
+def tensor_grad_scale(scale, grad):
+    return grad * F.cast(reciprocal(scale), F.dtype(grad))
+
+
+@_grad_scale.register("Tensor", "RowTensor")
+def tensor_grad_scale_row_tensor(scale, grad):
+    return RowTensor(grad.indices,
+                     grad.values * F.cast(reciprocal(scale), F.dtype(grad.values)),
+                     grad.dense_shape)
+
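+
+# Numeric sanity check for the EMA rule above (illustrative, not executed):
+# with ema_decay=0.9999, ema=1.0 and weight=0.0, one update gives
+# ema = 1.0 * 0.9999 + 0.0 * 0.0001 = 0.9999, i.e. the shadow weights drift
+# toward the live weights by 0.01% per step.
+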
+_grad_overflow = C.MultitypeFuncGraph("_grad_overflow")
+grad_overflow = P.FloatStatus()
+
+
+class TrainOneStepWithEMA(nn.TrainOneStepWithLossScaleCell):
+    """Train-one-step cell with loss scaling and an optional EMA of the weights."""
+
+    def __init__(self, network, optimizer, scale_sense=1.0, with_ema=False, ema_decay=0.9999):
+        super(TrainOneStepWithEMA, self).__init__(network, optimizer, scale_sense)
+        self.print = P.Print()
+        self.with_ema = with_ema
+        if self.with_ema:
+            self.ema_model = EMACell(self.weights, ema_decay=ema_decay)
+
+    def construct(self, *inputs):
+        """construct"""
+        weights = self.weights
+        loss = self.network(*inputs)
+        scaling_sens = self.scale_sense
+
+        status, scaling_sens = self.start_overflow_check(loss, scaling_sens)
+
+        scaling_sens_filled = C.ones_like(loss) * F.cast(scaling_sens, F.dtype(loss))
+        grads = self.grad(self.network, weights)(*inputs, scaling_sens_filled)
+        grads = self.hyper_map(F.partial(_grad_scale, scaling_sens), grads)
+        # apply grad reducer on grads
+        grads = self.grad_reducer(grads)
+        # get the overflow buffer
+        cond = self.get_overflow_status(status, grads)
+        overflow = self.process_loss_scale(cond)
+        # if there is no overflow, do optimize
+        if not overflow:
+            loss = F.depend(loss, self.optimizer(grads))
+            if self.with_ema:
+                self.ema_model(self.weights)
+        else:
+            self.print("=============Over Flow, skipping=============")
+        return loss
+
+
+"""TrainOneStepWithLossScaleCellGlobalNormClip"""
+# (_grad_scale and its registrations above are shared by both train cells.)
+
+
+class TrainOneStepWithLossScaleCellGlobalNormClip(
+    nn.TrainOneStepWithLossScaleCell
+):
+    """
+    Train-one-step cell with loss scaling and global-norm gradient clipping.
+
+    Appends an optimizer to the training network so that the construct
+    function builds the backward graph.
+
+    Args:
+        network (Cell): The training network. The loss function must already be wrapped in.
+        optimizer (Optimizer): Optimizer for updating the weights.
+        scale_sense (Number or Cell): The loss-scaling value or update cell. Default: 1.0.
+        use_global_norm (bool): Whether to apply global-norm clipping before the optimizer. Default: True.
+        clip_global_norm_value (float): Norm threshold for clipping. Default: 1.0.
+    """
+
+    def __init__(self,
+                 network,
+                 optimizer,
+                 scale_sense=1.0,
+                 use_global_norm=True,
+                 clip_global_norm_value=1.0):
+        super(TrainOneStepWithLossScaleCellGlobalNormClip, self).__init__(network, optimizer, scale_sense)
+        self.use_global_norm = use_global_norm  # construct() below reads this flag
+        self.clip_global_norm_value = clip_global_norm_value
+        self.print = P.Print()
+
+    def construct(self, *inputs):
+        """construct"""
+        weights = self.weights
+        loss = self.network(*inputs)
+        scaling_sens = self.scale_sense
+
+        status, scaling_sens = self.start_overflow_check(loss, scaling_sens)
+
+        scaling_sens_filled = C.ones_like(loss) * F.cast(scaling_sens, F.dtype(loss))
+        grads = self.grad(self.network, weights)(*inputs, scaling_sens_filled)
+        grads = self.hyper_map(F.partial(_grad_scale, scaling_sens), grads)
+        # apply grad reducer on grads
+        grads = self.grad_reducer(grads)
+        # get the overflow buffer
+        cond = self.get_overflow_status(status, grads)
+        overflow = self.process_loss_scale(cond)
+        # if there is no overflow, do optimize
+        if not overflow:
+            if self.use_global_norm:
+                grads = C.clip_by_global_norm(grads, clip_norm=self.clip_global_norm_value)
+            loss = F.depend(loss, self.optimizer(grads))
+        else:
+            self.print("=============Over Flow, skipping=============")
+        return loss
+
+
+def get_train_one_step(
+        network,
+        optimizer,
+        ema,
+        ema_decay,
+        clip_grad,
+        clip_value,
+        gradient_accumulation_steps,
+        scale_sense):
+    """Wrap `network` in the train-one-step cell matching the requested options.
+
+    `scale_sense` (a fixed Tensor or a loss-scale update cell) is built by the
+    caller; see `create_trainer` in trainer_factory_distillation.py.
+    `gradient_accumulation_steps` is accepted for interface parity with the
+    customized train step and is not used here.
+    """
+    if ema:
+        print(f"=> Using EMA. ema_decay: {ema_decay}")
+        network = TrainOneStepWithEMA(
+            network=network,
+            optimizer=optimizer,
+            scale_sense=scale_sense,
+            with_ema=ema,
+            ema_decay=ema_decay)
+    elif clip_grad:
+        print(f"=> Using gradient clipping by norm, clip_value: {clip_value}")
+        network = TrainOneStepWithLossScaleCellGlobalNormClip(
+            network,
+            optimizer,
+            scale_sense,
+            clip_global_norm_value=clip_value
+        )
+    else:
+        print("=> Use simple loss scale.")
+        network = nn.TrainOneStepWithLossScaleCell(
+            network, optimizer, scale_sense=scale_sense
+        )
+    return network
diff --git a/mindcv/utils/trainer_factory.py b/mindcv/utils/trainer_factory.py
index db47a48e665d8f2061b7dbdffc67fe9dfef17373..cb92651829d5097f7d632bb756b364cc844ae321 100644
--- a/mindcv/utils/trainer_factory.py
+++ b/mindcv/utils/trainer_factory.py
@@ -34,10 +34,10 @@ def get_metrics(num_classes):
 
 
 def require_customized_train_step(
-    ema: bool = False,
-    clip_grad: bool = False,
-    gradient_accumulation_steps: int = 1,
-    amp_cast_list: Optional[str] = None,
+        ema: bool = False,
+        clip_grad: bool = False,
+        gradient_accumulation_steps: int = 1,
+        amp_cast_list: Optional[str] = None,
 ):
     if ema:
         return True
@@ -74,20 +74,20 @@ def add_loss_network(network, loss_fn, amp_level):
 
 
 def create_trainer(
-    network: nn.Cell,
-    loss: nn.Cell,
-    optimizer: nn.Cell,
-    metrics: Union[dict, set],
-    amp_level: str,
-    amp_cast_list: str,
-    loss_scale_type: str,
-    loss_scale: float = 1.0,
-    drop_overflow_update: bool = False,
-    ema: bool = False,
-    ema_decay: float = 0.9999,
-    clip_grad: bool = False,
-    clip_value: float = 15.0,
-    gradient_accumulation_steps: int = 1,
+        network: nn.Cell,
+        loss: nn.Cell,
+        optimizer: nn.Cell,
+        metrics: Union[dict, set],
+        amp_level: str,
+        amp_cast_list: str,
+        loss_scale_type: str,
+        loss_scale: float = 1.0,
+        drop_overflow_update: bool = False,
+        ema: bool = False,
+        ema_decay: float = 0.9999,
+        clip_grad: bool = False,
+        clip_value: float = 15.0,
+        gradient_accumulation_steps: int = 1,
 ):
     """Create Trainer.
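The trainer factory added next wraps the student network in a `NetWithLoss` cell that calls `criterion(data, predict, label)`. The actual criterion is built by `mindcv/loss/distillation/criterion.py`, which is not part of this diff; the following is only a minimal sketch, under that assumption, of a DeiT-style hard-distillation loss with the same call signature. `HardDistillationLoss`, its `teacher` argument, and the choice of `SoftmaxCrossEntropyWithLogits` are illustrative, not the shipped implementation.

from mindspore import nn


class HardDistillationLoss(nn.Cell):
    """Sketch of a criterion matching NetWithLoss's criterion(data, predict, label) call."""

    def __init__(self, teacher: nn.Cell, base_criterion: nn.Cell, alpha: float = 0.5):
        super().__init__()
        self.teacher = teacher.set_train(False)  # teacher stays frozen
        self.base_criterion = base_criterion     # e.g. CE with label smoothing
        self.dist_criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
        self.alpha = alpha

    def construct(self, data, outputs, label):
        # In training mode the student returns (head, head_dist) logits.
        logits, logits_dist = outputs
        base_loss = self.base_criterion(logits, label)
        # Hard distillation: the teacher's argmax prediction serves as the target.
        teacher_labels = self.teacher(data).argmax(axis=-1)
        dist_loss = self.dist_criterion(logits_dist, teacher_labels)
        return base_loss * (1 - self.alpha) + dist_loss * self.alpha

With `distillation_type='soft'` one would instead match temperature-scaled softmax distributions using `distillation_tau`; the shipped criterion presumably switches between the two based on the `--distillation_type` argument added in config.py.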
diff --git a/mindcv/utils/trainer_factory_distillation.py b/mindcv/utils/trainer_factory_distillation.py new file mode 100644 index 0000000000000000000000000000000000000000..57a03a15abfa0f1cabb0301156fa7a118afff351 --- /dev/null +++ b/mindcv/utils/trainer_factory_distillation.py @@ -0,0 +1,221 @@ +import logging +from typing import Optional, Union + +import mindspore as ms +from mindspore import Tensor, context +from mindspore import dtype as mstype +from mindspore import nn +from mindspore.ops import functional as F +from mindspore.train import DynamicLossScaleManager, FixedLossScaleManager, Model + +from .amp import auto_mixed_precision +from .train_step import TrainStep +from .get_train_one_step import get_train_one_step + +__all__ = [ + "get_metrics", + "require_customized_train_step", + "create_trainer", +] + +_logger = logging.getLogger(__name__) + + +def get_metrics(num_classes): + if num_classes >= 5: + metrics = { + "Top_1_Accuracy": nn.Top1CategoricalAccuracy(), + "Top_5_Accuracy": nn.Top5CategoricalAccuracy(), + } + else: + metrics = { + "Top_1_Accuracy": nn.Top1CategoricalAccuracy(), + } + return metrics + + +def require_customized_train_step( + ema: bool = False, + clip_grad: bool = False, + gradient_accumulation_steps: int = 1, + amp_cast_list: Optional[str] = None, +): + if ema: + return True + if clip_grad: + return True + if gradient_accumulation_steps > 1: + return True + if amp_cast_list: + return True + return False + + +def add_loss_network(network, loss_fn, amp_level): + """Add loss network.""" + + class WithLossCell(nn.Cell): + "Wrap loss for amp. Cast network output back to float32" + + def __init__(self, backbone, loss_fn): + super(WithLossCell, self).__init__(auto_prefix=False) + self._backbone = backbone + self._loss_fn = loss_fn + + def construct(self, data, label): + out = self._backbone(data) + label = F.mixed_precision_cast(mstype.float32, label) + return self._loss_fn(F.mixed_precision_cast(mstype.float32, out), label) + + if amp_level == "O2" or amp_level == "O3": + network = WithLossCell(network, loss_fn) + else: + network = nn.WithLossCell(network, loss_fn) + return network + + +class NetWithLoss(nn.Cell): + """ + NetWithLoss: Only support Network with Classification. + """ + + def __init__(self, model, criterion): + super(NetWithLoss, self).__init__() + self.model = model + self.criterion = criterion + + def construct(self, *inputs, **kwargs): + data = inputs[0] + label = inputs[1] + predict = self.model(data) + loss = self.criterion(data, predict, label) + return loss + + +def create_trainer( + network: nn.Cell, + loss: nn.Cell, + criterion: nn.Cell, + optimizer: nn.Cell, + metrics: Union[dict, set], + amp_level: str, + amp_cast_list: str, + loss_scale_type: str, + loss_scale: float = 1.0, + drop_overflow_update: bool = False, + ema: bool = False, + ema_decay: float = 0.9999, + clip_grad: bool = False, + clip_value: float = 15.0, + gradient_accumulation_steps: int = 1, +): + """Create Trainer. + + Args: + network: The backbone network to train, evaluate or predict. + loss: The function of eval_network loss. + criterion: The function of calculating loss, + optimizer: The optimizer for training. + metrics: The metrics for model evaluation. + amp_level: The level of auto mixing precision training. + amp_cast_list: At the cell level, custom casting the cell to FP16. + loss_scale_type: The type of loss scale. + loss_scale: The value of loss scale. + drop_overflow_update: Whether to execute optimizer if there is an overflow. 
+ ema: Whether to use exponential moving average of model weights. + ema_decay: Decay factor for model weights moving average. + clip_grad: whether to gradient clip. + clip_value: The value at which to clip gradients. + gradient_accumulation_steps: Accumulate the gradients of n batches before update. + + Returns: + mindspore.Model + + """ + if loss_scale < 1.0: + raise ValueError("Loss scale cannot be less than 1.0!") + + if drop_overflow_update is False and loss_scale_type.lower() == "dynamic": + raise ValueError("DynamicLossScale ALWAYS drop overflow!") + + if gradient_accumulation_steps < 1: + raise ValueError("`gradient_accumulation_steps` must be >= 1!") + + if not require_customized_train_step(ema, clip_grad, gradient_accumulation_steps, amp_cast_list): + net_with_loss = NetWithLoss(network, criterion) + eval_network = nn.WithEvalCell(network, loss, amp_level in ["O2", "O3", "auto"]) + eval_indexes = [0, 1, 2] + + mindspore_kwargs = dict( + network=net_with_loss, + # loss_fn=loss, + optimizer=optimizer, + metrics=metrics, + amp_level=amp_level, + eval_network=eval_network, + eval_indexes=eval_indexes, + ) + if loss_scale_type.lower() == "fixed": + mindspore_kwargs["loss_scale_manager"] = FixedLossScaleManager( + loss_scale=loss_scale, drop_overflow_update=drop_overflow_update + ) + elif loss_scale_type.lower() == "dynamic": + mindspore_kwargs["loss_scale_manager"] = DynamicLossScaleManager( + init_loss_scale=loss_scale, scale_factor=2, scale_window=2000 + ) + elif loss_scale_type.lower() == "auto": + # We don't explicitly construct LossScaleManager + _logger.warning( + "You are using AUTO loss scale, which means the LossScaleManager isn't explicitly pass in " + "when creating a mindspore.Model instance. " + "NOTE: mindspore.Model may use LossScaleManager silently. See mindspore.train.amp for details." + ) + else: + raise ValueError(f"Loss scale type only support ['fixed', 'dynamic', 'auto'], but got{loss_scale_type}.") + model = Model(**mindspore_kwargs) + else: # require customized train step + eval_network = nn.WithEvalCell(network, loss, amp_level in ["O2", "O3", "auto"]) # loss=ce + auto_mixed_precision(network, amp_level, amp_cast_list) + # net_with_loss = add_loss_network(network, loss, amp_level) + # criterion = get_criterion_by_args(args) + net_with_loss = NetWithLoss(network, criterion) # add_loss_network + + train_step_kwargs = dict( + network=net_with_loss, + optimizer=optimizer, + ema=ema, + ema_decay=ema_decay, + clip_grad=clip_grad, + clip_value=clip_value, + gradient_accumulation_steps=gradient_accumulation_steps, + ) + if loss_scale_type.lower() == "fixed": + loss_scale_manager = FixedLossScaleManager(loss_scale=loss_scale, + drop_overflow_update=drop_overflow_update) # scale_sense + elif loss_scale_type.lower() == "dynamic": + loss_scale_manager = DynamicLossScaleManager(init_loss_scale=loss_scale, scale_factor=2, scale_window=2000) + else: + raise ValueError(f"Loss scale type only support ['fixed', 'dynamic'], but got{loss_scale_type}.") + update_cell = loss_scale_manager.get_update_cell() + # 1. loss_scale_type="fixed", drop_overflow_update=False + # --> update_cell=None, TrainStep=TrainOneStepCell(scale_sense=loss_scale) + # 2. loss_scale_type: fixed, drop_overflow_update: True + # --> update_cell=FixedLossScaleUpdateCell, TrainStep=TrainOneStepWithLossScaleCell(scale_sense=update_cell) + # 3. 
loss_scale_type: dynamic, drop_overflow_update: True + # --> update_cell=DynamicLossScaleUpdateCell, TrainStep=TrainOneStepWithLossScaleCell(scale_sense=update_cell) + if update_cell is None: + train_step_kwargs["scale_sense"] = Tensor(loss_scale, dtype=ms.float32) + else: + if not context.get_context("enable_ge") and context.get_context("device_target") == "CPU": + raise ValueError( + "Only `loss_scale_type` is `fixed` and `drop_overflow_update` is `False`" + "are supported on device `CPU`." + ) + train_step_kwargs["scale_sense"] = update_cell + # train_step_cell = TrainStep(**train_step_kwargs).set_train() + + train_step_cell = get_train_one_step(**train_step_kwargs).set_train() + model = Model(train_step_cell, eval_network=eval_network, metrics=metrics, eval_indexes=[0, 1, 2]) + # + # todo: do we need to set model._loss_scale_manager + return model diff --git a/train_zhisuan.py b/train_zhisuan.py new file mode 100644 index 0000000000000000000000000000000000000000..8eee78f51cec3ba5d4ea8d5281735d3ddcfa2583 --- /dev/null +++ b/train_zhisuan.py @@ -0,0 +1,434 @@ +""" Model training pipeline """ +import logging +import os +import numpy as np +import moxing as mox +import time +import json + +import mindspore as ms +from mindspore import FixedLossScaleManager, Model, Tensor, nn, context +from mindspore.communication import get_group_size, get_rank, init + +from mindspore.context import ParallelMode + +from mindcv.data import create_dataset, create_loader, create_transforms +from mindcv.loss import create_loss +from mindcv.models import create_model +from mindcv.optim import create_optimizer +from mindcv.scheduler import create_scheduler +from mindcv.utils import ( + AllReduceSum, + StateMonitor, + # create_trainer, + get_metrics, + require_customized_train_step, + set_logger, + set_seed, +) +from mindcv.utils.trainer_factory import create_trainer +from config import parse_args, save_args # isort: skip + +logger = logging.getLogger("mindcv.train") + + +def train(args): + """main train function""" + + ms.set_context(mode=args.mode) + if args.distribute: + init() + device_num = get_group_size() + rank_id = get_rank() + ms.set_auto_parallel_context( + device_num=device_num, + parallel_mode="data_parallel", + gradients_mean=True, + # we should but cannot set parameter_broadcast=True, which will cause error on gpu. 
+ ) + else: + device_num = None + rank_id = None + + set_seed(args.seed) + set_logger(name="mindcv", output_dir=args.ckpt_save_dir, rank=rank_id, color=False) + logger.info( + "We recommend installing `termcolor` via `pip install termcolor` " + "and setup logger by `set_logger(..., color=True)`" + ) + + # create dataset + dataset_train = create_dataset( + name=args.dataset, + root=args.data_dir, + split=args.train_split, + shuffle=args.shuffle, + num_samples=args.num_samples, + num_shards=device_num, + shard_id=rank_id, + num_parallel_workers=args.num_parallel_workers, + download=args.dataset_download, + num_aug_repeats=args.aug_repeats, + ) + + if args.num_classes is None: + num_classes = dataset_train.num_classes() + else: + num_classes = args.num_classes + + # create transforms + num_aug_splits = 0 + if args.aug_splits > 0: + assert args.aug_splits == 3, "Currently, only support 3 splits of augmentation" + assert args.auto_augment is not None, "aug_splits should be set with one auto_augment" + num_aug_splits = args.aug_splits + + transform_list = create_transforms( + dataset_name=args.dataset, + is_training=True, + image_resize=args.image_resize, + scale=args.scale, + ratio=args.ratio, + hflip=args.hflip, + vflip=args.vflip, + color_jitter=args.color_jitter, + interpolation=args.interpolation, + auto_augment=args.auto_augment, + mean=args.mean, + std=args.std, + re_prob=args.re_prob, + re_scale=args.re_scale, + re_ratio=args.re_ratio, + re_value=args.re_value, + re_max_attempts=args.re_max_attempts, + separate=num_aug_splits > 0, + ) + + # load dataset + loader_train = create_loader( + dataset=dataset_train, + batch_size=args.batch_size, + drop_remainder=args.drop_remainder, + is_training=True, + mixup=args.mixup, + cutmix=args.cutmix, + cutmix_prob=args.cutmix_prob, + num_classes=num_classes, + transform=transform_list, + num_parallel_workers=args.num_parallel_workers, + separate=num_aug_splits > 0, + ) + + if args.val_while_train: + dataset_eval = create_dataset( + name=args.dataset, + root=args.data_dir, + split=args.val_split, + num_shards=device_num, + shard_id=rank_id, + num_parallel_workers=args.num_parallel_workers, + download=args.dataset_download, + ) + + transform_list_eval = create_transforms( + dataset_name=args.dataset, + is_training=False, + image_resize=args.image_resize, + crop_pct=args.crop_pct, + interpolation=args.interpolation, + mean=args.mean, + std=args.std, + ) + + loader_eval = create_loader( + dataset=dataset_eval, + batch_size=args.batch_size, + drop_remainder=False, + is_training=False, + transform=transform_list_eval, + num_parallel_workers=args.num_parallel_workers, + ) + # validation dataset count + eval_count = dataset_eval.get_dataset_size() + if args.distribute: + all_reduce = AllReduceSum() + eval_count = all_reduce(Tensor(eval_count, ms.int32)) + else: + loader_eval = None + eval_count = None + + num_batches = loader_train.get_dataset_size() + # Train dataset count + train_count = dataset_train.get_dataset_size() + if args.distribute: + all_reduce = AllReduceSum() + train_count = all_reduce(Tensor(train_count, ms.int32)) + + # create model + network = create_model( + model_name=args.model, + num_classes=num_classes, + in_channels=args.in_channels, + drop_rate=args.drop_rate, + drop_path_rate=args.drop_path_rate, + pretrained=args.pretrained, + checkpoint_path=args.ckpt_path, + ema=args.ema, + ) + + num_params = sum([param.size for param in network.get_parameters()]) + + # create loss + loss = create_loss( + name=args.loss, + reduction=args.reduction, 
+ label_smoothing=args.label_smoothing, + aux_factor=args.aux_factor, + ) + + # create learning rate schedule + lr_scheduler = create_scheduler( + num_batches, + scheduler=args.scheduler, + lr=args.lr, + min_lr=args.min_lr, + warmup_epochs=args.warmup_epochs, + warmup_factor=args.warmup_factor, + decay_epochs=args.decay_epochs, + decay_rate=args.decay_rate, + milestones=args.multi_step_decay_milestones, + num_epochs=args.epoch_size, + num_cycles=args.num_cycles, + cycle_decay=args.cycle_decay, + lr_epoch_stair=args.lr_epoch_stair, + ) + + # resume training if ckpt_path is given + if args.ckpt_path != "" and args.resume_opt: + opt_ckpt_path = os.path.join(args.ckpt_save_dir, f"optim_{args.model}.ckpt") + else: + opt_ckpt_path = "" + + # create optimizer + # TODO: consistent naming opt, name, dataset_name + if ( + args.loss_scale_type == "fixed" + and args.drop_overflow_update is False + and not require_customized_train_step( + args.ema, + args.clip_grad, + args.gradient_accumulation_steps, + args.amp_cast_list, + ) + ): + optimizer_loss_scale = args.loss_scale + else: + optimizer_loss_scale = 1.0 + optimizer = create_optimizer( + network.trainable_params(), + opt=args.opt, + lr=lr_scheduler, + weight_decay=args.weight_decay, + momentum=args.momentum, + nesterov=args.use_nesterov, + filter_bias_and_bn=args.filter_bias_and_bn, + loss_scale=optimizer_loss_scale, + checkpoint_path=opt_ckpt_path, + eps=args.eps, + ) + + # Define eval metrics. + metrics = get_metrics(num_classes) + + # create trainer + trainer = create_trainer( + network, + loss, + optimizer, + metrics, + amp_level=args.amp_level, + amp_cast_list=args.amp_cast_list, + loss_scale_type=args.loss_scale_type, + loss_scale=args.loss_scale, + drop_overflow_update=args.drop_overflow_update, + ema=args.ema, + ema_decay=args.ema_decay, + clip_grad=args.clip_grad, + clip_value=args.clip_value, + gradient_accumulation_steps=args.gradient_accumulation_steps, + ) + + # callback + # save checkpoint, summary training loss + # record val acc and do model selection if val dataset is available + begin_step = 0 + begin_epoch = 0 + if args.ckpt_path != "": + begin_step = optimizer.global_step.asnumpy()[0] + begin_epoch = args.ckpt_path.split("/")[-1].split("-")[1].split("_")[0] + begin_epoch = int(begin_epoch) + + summary_dir = f"./{args.ckpt_save_dir}/summary" + assert ( + args.ckpt_save_policy != "top_k" or args.val_while_train is True + ), "ckpt_save_policy is top_k, val_while_train must be True." 
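+    # StateMonitor drives per-epoch validation, checkpointing and summary
+    # logging; it is the single callback used below.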
+ state_cb = StateMonitor( + trainer, + model_name=args.model, + model_ema=args.ema, + last_epoch=begin_epoch, + dataset_sink_mode=args.dataset_sink_mode, + dataset_val=loader_eval, + metric_name=list(metrics.keys()), + val_interval=args.val_interval, + ckpt_save_dir=args.ckpt_save_dir, + ckpt_save_interval=args.ckpt_save_interval, + ckpt_save_policy=args.ckpt_save_policy, + ckpt_keep_max=args.keep_checkpoint_max, + summary_dir=summary_dir, + log_interval=args.log_interval, + rank_id=rank_id, + device_num=device_num, + ) + + callbacks = [state_cb] + essential_cfg_msg = "\n".join( + [ + "Essential Experiment Configurations:", + f"MindSpore mode[GRAPH(0)/PYNATIVE(1)]: {args.mode}", + f"Distributed mode: {args.distribute}", + f"Number of devices: {device_num if device_num is not None else 1}", + f"Number of training samples: {train_count}", + f"Number of validation samples: {eval_count}", + f"Number of classes: {num_classes}", + f"Number of batches: {num_batches}", + f"Batch size: {args.batch_size}", + f"Auto augment: {args.auto_augment}", + f"MixUp: {args.mixup}", + f"CutMix: {args.cutmix}", + f"Model: {args.model}", + f"Model parameters: {num_params}", + f"Number of epochs: {args.epoch_size}", + f"Optimizer: {args.opt}", + f"Learning rate: {args.lr}", + f"LR Scheduler: {args.scheduler}", + f"Momentum: {args.momentum}", + f"Weight decay: {args.weight_decay}", + f"Auto mixed precision: {args.amp_level}", + f"Loss scale: {args.loss_scale}({args.loss_scale_type})", + ] + ) + logger.info(essential_cfg_msg) + save_args(args, os.path.join(args.ckpt_save_dir, f"{args.model}.yaml"), rank_id) + + if args.ckpt_path != "": + logger.info(f"Resume training from {args.ckpt_path}, last step: {begin_step}, last epoch: {begin_epoch}") + else: + logger.info("Start training") + + trainer.train(args.epoch_size, loader_train, callbacks=callbacks, dataset_sink_mode=args.dataset_sink_mode) + +def C2netMultiObsToEnv(multi_data_url, data_dir): + #--multi_data_url is json data, need to do json parsing for multi_data_url + print(multi_data_url) + multi_data_json = json.loads(multi_data_url) + for i in range(len(multi_data_json)): + zipfile_path = data_dir + "/" + multi_data_json[i]["dataset_name"] + try: + mox.file.copy(multi_data_json[i]["dataset_url"], zipfile_path) + print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"],zipfile_path)) + #get filename and unzip the dataset + filename = os.path.splitext(multi_data_json[i]["dataset_name"])[0] + filePath = data_dir + "/" + filename + if not os.path.exists(filePath): + os.makedirs(filePath) + os.system("unzip {} -d {}".format(zipfile_path, filePath)) + + except Exception as e: + print('moxing download {} to {} failed: '.format( + multi_data_json[i]["dataset_url"], zipfile_path) + str(e)) + #Set a cache file to determine whether the data has been copied to obs. + #If this file exists during multi-card training, there is no need to copy the dataset multiple times. 
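+    # The empty sentinel file below acts as a cross-process barrier: rank 0
+    # creates it once the copy/unzip loop has finished, and the other ranks in
+    # DownloadFromQizhi poll for it before touching the dataset.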
+ f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return + + +def DownloadFromQizhi(multi_data_url, data_dir): + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + C2netMultiObsToEnv(multi_data_url,data_dir) + context.set_context(mode=context.GRAPH_MODE,device_target=args.device_target) + if device_num > 1: + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num = device_num, parallel_mode=ParallelMode.DATA_PARALLEL, gradients_mean=True, parameter_broadcast=True) + init() + #Copying obs data does not need to be executed multiple times, just let the 0th card copy the data + local_rank=int(os.getenv('RANK_ID')) + if local_rank%8==0: + C2netMultiObsToEnv(multi_data_url,data_dir) + #If the cache file does not exist, it means that the copy data has not been completed, + #and Wait for 0th card to finish copying data + while not os.path.exists("/cache/download_input.txt"): + time.sleep(1) + return + + +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir, + obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir, + obs_train_url) + str(e)) + return + + +def UploadToQizhi(train_dir, obs_train_url): + device_num = int(os.getenv('RANK_SIZE')) + local_rank=int(os.getenv('RANK_ID')) + if device_num == 1: + EnvToObs(train_dir, obs_train_url) + if device_num > 1: + if local_rank%8==0: + EnvToObs(train_dir, obs_train_url) + return + +if __name__ == "__main__": + args = parse_args() + + # modelarts + data_dir = '/cache/dataset' + train_dir = '/cache/output' + ckpt_url = '/cache/checkpoint.ckpt' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + print(args.multi_data_url) + DownloadFromQizhi(args.multi_data_url, data_dir) + + train(args) + + UploadToQizhi(train_dir,args.train_url) + + # # data sync for cloud platform if enabled + # if args.enable_modelarts: + # import moxing as mox + + # args.data_dir = f"/cache/{args.data_url}" + # mox.file.copy_parallel(src_url=os.path.join(args.data_url, args.dataset), dst_url=args.data_dir) + + # # core training + # train(args) + + # if args.enable_modelarts: + # mox.file.copy_parallel(src_url=args.ckpt_save_dir, dst_url=args.train_url) \ No newline at end of file diff --git a/train_zhisuan_distillation.py b/train_zhisuan_distillation.py new file mode 100644 index 0000000000000000000000000000000000000000..512e677fea64d7a176eecc83a077fd97cbe16524 --- /dev/null +++ b/train_zhisuan_distillation.py @@ -0,0 +1,455 @@ +""" Model training pipeline """ +import logging +import os +import moxing as mox +import time +import json + +import mindspore as ms +from mindspore import Tensor, context +from mindspore.communication import get_group_size, get_rank, init + +from mindspore.context import ParallelMode + +from mindcv.data import create_dataset, create_loader, create_transforms +from mindcv.loss import create_loss +from mindcv.models import create_model +from mindcv.optim import create_optimizer +from mindcv.scheduler import create_scheduler +from mindcv.utils import ( + AllReduceSum, + 
StateMonitor, + # create_trainer, + get_metrics, + require_customized_train_step, + set_logger, + set_seed, +) +from mindcv.utils.trainer_factory_distillation import create_trainer +from config import parse_args, save_args # isort: skip +# add distillation +from mindcv.loss.distillation.criterion import get_criterion_by_args + +logger = logging.getLogger("mindcv.train") + + +def train(args): + """main train function""" + + ms.set_context(mode=args.mode) + if args.distribute: + init() + device_num = get_group_size() + rank_id = get_rank() + ms.set_auto_parallel_context( + device_num=device_num, + parallel_mode="data_parallel", + gradients_mean=True, + # we should but cannot set parameter_broadcast=True, which will cause error on gpu. + ) + else: + device_num = None + rank_id = None + + set_seed(args.seed) + set_logger(name="mindcv", output_dir=args.ckpt_save_dir, rank=rank_id, color=False) + logger.info( + "We recommend installing `termcolor` via `pip install termcolor` " + "and setup logger by `set_logger(..., color=True)`" + ) + + # create dataset + dataset_train = create_dataset( + name=args.dataset, + root=args.data_dir, + split=args.train_split, + shuffle=args.shuffle, + num_samples=args.num_samples, + num_shards=device_num, + shard_id=rank_id, + num_parallel_workers=args.num_parallel_workers, + download=args.dataset_download, + num_aug_repeats=args.aug_repeats, + ) + + if args.num_classes is None: + num_classes = dataset_train.num_classes() + else: + num_classes = args.num_classes + + # create transforms + num_aug_splits = 0 + if args.aug_splits > 0: + assert args.aug_splits == 3, "Currently, only support 3 splits of augmentation" + assert args.auto_augment is not None, "aug_splits should be set with one auto_augment" + num_aug_splits = args.aug_splits + + transform_list = create_transforms( + dataset_name=args.dataset, + is_training=True, + image_resize=args.image_resize, + scale=args.scale, + ratio=args.ratio, + hflip=args.hflip, + vflip=args.vflip, + color_jitter=args.color_jitter, + interpolation=args.interpolation, + auto_augment=args.auto_augment, + mean=args.mean, + std=args.std, + re_prob=args.re_prob, + re_scale=args.re_scale, + re_ratio=args.re_ratio, + re_value=args.re_value, + re_max_attempts=args.re_max_attempts, + separate=num_aug_splits > 0, + ) + + # load dataset + loader_train = create_loader( + dataset=dataset_train, + batch_size=args.batch_size, + drop_remainder=args.drop_remainder, + is_training=True, + mixup=args.mixup, + cutmix=args.cutmix, + cutmix_prob=args.cutmix_prob, + num_classes=num_classes, + transform=transform_list, + num_parallel_workers=args.num_parallel_workers, + separate=num_aug_splits > 0, + ) + + if args.val_while_train: + dataset_eval = create_dataset( + name=args.dataset, + root=args.data_dir, + split=args.val_split, + num_shards=device_num, + shard_id=rank_id, + num_parallel_workers=args.num_parallel_workers, + download=args.dataset_download, + ) + + transform_list_eval = create_transforms( + dataset_name=args.dataset, + is_training=False, + image_resize=args.image_resize, + crop_pct=args.crop_pct, + interpolation=args.interpolation, + mean=args.mean, + std=args.std, + ) + + loader_eval = create_loader( + dataset=dataset_eval, + batch_size=args.batch_size, + drop_remainder=False, + is_training=False, + transform=transform_list_eval, + num_parallel_workers=args.num_parallel_workers, + ) + # validation dataset count + eval_count = dataset_eval.get_dataset_size() + if args.distribute: + all_reduce = AllReduceSum() + eval_count = 
all_reduce(Tensor(eval_count, ms.int32)) + else: + loader_eval = None + eval_count = None + + num_batches = loader_train.get_dataset_size() + # Train dataset count + train_count = dataset_train.get_dataset_size() + if args.distribute: + all_reduce = AllReduceSum() + train_count = all_reduce(Tensor(train_count, ms.int32)) + + # create model + network = create_model( + model_name=args.model, + num_classes=num_classes, + in_channels=args.in_channels, + drop_rate=args.drop_rate, + drop_path_rate=args.drop_path_rate, + pretrained=args.pretrained, + checkpoint_path=args.ckpt_path, + ema=args.ema, + ) + + num_params = sum([param.size for param in network.get_parameters()]) + + # create eval_network loss + loss = create_loss( + name=args.loss, + reduction=args.reduction, + label_smoothing=args.label_smoothing, + aux_factor=args.aux_factor, + ) + + # create learning rate schedule + lr_scheduler = create_scheduler( + num_batches, + scheduler=args.scheduler, + lr=args.lr, + min_lr=args.min_lr, + warmup_epochs=args.warmup_epochs, + warmup_factor=args.warmup_factor, + decay_epochs=args.decay_epochs, + decay_rate=args.decay_rate, + milestones=args.multi_step_decay_milestones, + num_epochs=args.epoch_size, + num_cycles=args.num_cycles, + cycle_decay=args.cycle_decay, + lr_epoch_stair=args.lr_epoch_stair, + ) + + # resume training if ckpt_path is given + if args.ckpt_path != "" and args.resume_opt: + opt_ckpt_path = os.path.join(args.ckpt_save_dir, f"optim_{args.model}.ckpt") + else: + opt_ckpt_path = "" + + # create optimizer + # TODO: consistent naming opt, name, dataset_name + if ( + args.loss_scale_type == "fixed" + and args.drop_overflow_update is False + and not require_customized_train_step( + args.ema, + args.clip_grad, + args.gradient_accumulation_steps, + args.amp_cast_list, + ) + ): + optimizer_loss_scale = args.loss_scale + else: + optimizer_loss_scale = 1.0 + optimizer = create_optimizer( + network.trainable_params(), + opt=args.opt, + lr=lr_scheduler, + weight_decay=args.weight_decay, + momentum=args.momentum, + nesterov=args.use_nesterov, + filter_bias_and_bn=args.filter_bias_and_bn, + loss_scale=optimizer_loss_scale, + checkpoint_path=opt_ckpt_path, + eps=args.eps, + ) + + # Define eval metrics. 
+ metrics = get_metrics(num_classes) + + # loss for levit distillation + criterion = create_loss( + name=args.name_for_distillation, + label_smoothing=args.label_smoothing, # 0.1 + num_classes=args.num_classes, # 1000 + mixup=args.mixup, # 0.8 + cutmix=args.cutmix, # 1.0 + bce_loss=args.bce_loss, # flase + distillation_type=args.distillation_type, # hard + teacher_path=args.teacher_path, # ckpt + teacher_model=args.teacher_model, # regnety_160 + distillation_alpha=args.distillation_alpha, # 0.5 + distillation_tau=args.distillation_tau, # 1.0 + ) + + # create trainer + trainer = create_trainer( + network, + loss, + criterion, + optimizer, + metrics, + amp_level=args.amp_level, # O2 + amp_cast_list=args.amp_cast_list, # none + loss_scale_type=args.loss_scale_type, # fixed + loss_scale=args.loss_scale, # 1024 + drop_overflow_update=args.drop_overflow_update, # False + ema=args.ema, # False + ema_decay=args.ema_decay, # 0.9999 + clip_grad=args.clip_grad, # False + clip_value=args.clip_value, # 15.0 + gradient_accumulation_steps=args.gradient_accumulation_steps, # 1 + ) + + # callback + # save checkpoint, summary training loss + # record val acc and do model selection if val dataset is available + begin_step = 0 + begin_epoch = 0 + if args.ckpt_path != "": + begin_step = optimizer.global_step.asnumpy()[0] + begin_epoch = args.ckpt_path.split("/")[-1].split("-")[1].split("_")[0] + begin_epoch = int(begin_epoch) + + summary_dir = f"./{args.ckpt_save_dir}/summary" + assert ( + args.ckpt_save_policy != "top_k" or args.val_while_train is True + ), "ckpt_save_policy is top_k, val_while_train must be True." + state_cb = StateMonitor( + trainer, + model_name=args.model, + model_ema=args.ema, + last_epoch=begin_epoch, + dataset_sink_mode=args.dataset_sink_mode, + dataset_val=loader_eval, + metric_name=list(metrics.keys()), + val_interval=args.val_interval, + ckpt_save_dir=args.ckpt_save_dir, + ckpt_save_interval=args.ckpt_save_interval, + ckpt_save_policy=args.ckpt_save_policy, + ckpt_keep_max=args.keep_checkpoint_max, + summary_dir=summary_dir, + log_interval=args.log_interval, + rank_id=rank_id, + device_num=device_num, + ) + + callbacks = [state_cb] + essential_cfg_msg = "\n".join( + [ + "Essential Experiment Configurations:", + f"MindSpore mode[GRAPH(0)/PYNATIVE(1)]: {args.mode}", + f"Distributed mode: {args.distribute}", + f"Number of devices: {device_num if device_num is not None else 1}", + f"Number of training samples: {train_count}", + f"Number of validation samples: {eval_count}", + f"Number of classes: {num_classes}", + f"Number of batches: {num_batches}", + f"Batch size: {args.batch_size}", + f"Auto augment: {args.auto_augment}", + f"MixUp: {args.mixup}", + f"CutMix: {args.cutmix}", + f"Model: {args.model}", + f"Model parameters: {num_params}", + f"Number of epochs: {args.epoch_size}", + f"Optimizer: {args.opt}", + f"Learning rate: {args.lr}", + f"LR Scheduler: {args.scheduler}", + f"Momentum: {args.momentum}", + f"Weight decay: {args.weight_decay}", + f"Auto mixed precision: {args.amp_level}", + f"Loss scale: {args.loss_scale}({args.loss_scale_type})", + ] + ) + logger.info(essential_cfg_msg) + save_args(args, os.path.join(args.ckpt_save_dir, f"{args.model}.yaml"), rank_id) + + if args.ckpt_path != "": + logger.info(f"Resume training from {args.ckpt_path}, last step: {begin_step}, last epoch: {begin_epoch}") + else: + logger.info("Start training") + + trainer.train(args.epoch_size, loader_train, callbacks=callbacks, dataset_sink_mode=args.dataset_sink_mode) + + +def 
C2netMultiObsToEnv(multi_data_url, data_dir): + # --multi_data_url is json data, need to do json parsing for multi_data_url + print(multi_data_url) + multi_data_json = json.loads(multi_data_url) + for i in range(len(multi_data_json)): + zipfile_path = data_dir + "/" + multi_data_json[i]["dataset_name"] + try: + mox.file.copy(multi_data_json[i]["dataset_url"], zipfile_path) + print("Successfully Download {} to {}".format(multi_data_json[i]["dataset_url"], zipfile_path)) + # get filename and unzip the dataset + filename = os.path.splitext(multi_data_json[i]["dataset_name"])[0] + filePath = data_dir + "/" + filename + if not os.path.exists(filePath): + os.makedirs(filePath) + os.system("unzip {} -d {}".format(zipfile_path, filePath)) + + except Exception as e: + print('moxing download {} to {} failed: '.format( + multi_data_json[i]["dataset_url"], zipfile_path) + str(e)) + # Set a cache file to determine whether the data has been copied to obs. + # If this file exists during multi-card training, there is no need to copy the dataset multiple times. + f = open("/cache/download_input.txt", 'w') + f.close() + try: + if os.path.exists("/cache/download_input.txt"): + print("download_input succeed") + except Exception as e: + print("download_input failed") + return + + +def DownloadFromQizhi(multi_data_url, data_dir): + device_num = int(os.getenv('RANK_SIZE')) + if device_num == 1: + C2netMultiObsToEnv(multi_data_url, data_dir) + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) + if device_num > 1: + # set device_id and init for multi-card training + context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target, + device_id=int(os.getenv('ASCEND_DEVICE_ID'))) + context.reset_auto_parallel_context() + context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, + gradients_mean=True, parameter_broadcast=True) + init() + # Copying obs data does not need to be executed multiple times, just let the 0th card copy the data + local_rank = int(os.getenv('RANK_ID')) + if local_rank % 8 == 0: + C2netMultiObsToEnv(multi_data_url, data_dir) + # If the cache file does not exist, it means that the copy data has not been completed, + # and Wait for 0th card to finish copying data + while not os.path.exists("/cache/download_input.txt"): + time.sleep(1) + return + + +def EnvToObs(train_dir, obs_train_url): + try: + mox.file.copy_parallel(train_dir, obs_train_url) + print("Successfully Upload {} to {}".format(train_dir, + obs_train_url)) + except Exception as e: + print('moxing upload {} to {} failed: '.format(train_dir, + obs_train_url) + str(e)) + return + + +def UploadToQizhi(train_dir, obs_train_url): + device_num = int(os.getenv('RANK_SIZE')) + local_rank = int(os.getenv('RANK_ID')) + if device_num == 1: + EnvToObs(train_dir, obs_train_url) + if device_num > 1: + if local_rank % 8 == 0: + EnvToObs(train_dir, obs_train_url) + return + + +if __name__ == "__main__": + args = parse_args() + + # modelarts + data_dir = '/cache/dataset' + train_dir = '/cache/output' + ckpt_url = '/cache/checkpoint.ckpt' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + if not os.path.exists(train_dir): + os.makedirs(train_dir) + print(args.multi_data_url) + DownloadFromQizhi(args.multi_data_url, data_dir) + + train(args) + + UploadToQizhi(train_dir, args.train_url) + + # # data sync for cloud platform if enabled + # if args.enable_modelarts: + # import moxing as mox + + # args.data_dir = f"/cache/{args.data_url}" + # 
mox.file.copy_parallel(src_url=os.path.join(args.data_url, args.dataset), dst_url=args.data_dir) + + # # core training + # train(args) + + # if args.enable_modelarts: + # mox.file.copy_parallel(src_url=args.ckpt_save_dir, dst_url=args.train_url) diff --git a/train_zhisuan_dump.py b/train_zhisuan_dump.py new file mode 100644 index 0000000000000000000000000000000000000000..dfc1ff7654c7bb1b52af3f8157b1b33695b3a782 --- /dev/null +++ b/train_zhisuan_dump.py @@ -0,0 +1,443 @@ +""" Model training pipeline """ +import logging +import os +import numpy as np +import moxing as mox +import time +import json + +import mindspore as ms +from mindspore import FixedLossScaleManager, Model, Tensor, nn, context +from mindspore.communication import get_group_size, get_rank, init + +from mindspore.context import ParallelMode + +from mindcv.data import create_dataset, create_loader, create_transforms +from mindcv.loss import create_loss +from mindcv.models import create_model +from mindcv.optim import create_optimizer +from mindcv.scheduler import create_scheduler +from mindcv.utils import ( + AllReduceSum, + StateMonitor, + create_trainer, + get_metrics, + require_customized_train_step, + set_logger, + set_seed, +) + +from config import parse_args, save_args # isort: skip + +logger = logging.getLogger("mindcv.train") + + +def train(args): + """main train function""" + + ms.set_context(mode=args.mode) + if args.distribute: + init() + device_num = get_group_size() + rank_id = get_rank() + ms.set_auto_parallel_context( + device_num=device_num, + parallel_mode="data_parallel", + gradients_mean=True, + # we should but cannot set parameter_broadcast=True, which will cause error on gpu. + ) + else: + device_num = None + rank_id = None + + set_seed(args.seed) + set_logger(name="mindcv", output_dir=args.ckpt_save_dir, rank=rank_id, color=False) + logger.info( + "We recommend installing `termcolor` via `pip install termcolor` " + "and setup logger by `set_logger(..., color=True)`" + ) + + # create dataset + dataset_train = create_dataset( + name=args.dataset, + root=args.data_dir, + split=args.train_split, + shuffle=args.shuffle, + num_samples=args.num_samples, + num_shards=device_num, + shard_id=rank_id, + num_parallel_workers=args.num_parallel_workers, + download=args.dataset_download, + num_aug_repeats=args.aug_repeats, + ) + + if args.num_classes is None: + num_classes = dataset_train.num_classes() + else: + num_classes = args.num_classes + + # create transforms + num_aug_splits = 0 + if args.aug_splits > 0: + assert args.aug_splits == 3, "Currently, only support 3 splits of augmentation" + assert args.auto_augment is not None, "aug_splits should be set with one auto_augment" + num_aug_splits = args.aug_splits + + transform_list = create_transforms( + dataset_name=args.dataset, + is_training=True, + image_resize=args.image_resize, + scale=args.scale, + ratio=args.ratio, + hflip=args.hflip, + vflip=args.vflip, + color_jitter=args.color_jitter, + interpolation=args.interpolation, + auto_augment=args.auto_augment, + mean=args.mean, + std=args.std, + re_prob=args.re_prob, + re_scale=args.re_scale, + re_ratio=args.re_ratio, + re_value=args.re_value, + re_max_attempts=args.re_max_attempts, + separate=num_aug_splits > 0, + ) + + # load dataset + loader_train = create_loader( + dataset=dataset_train, + batch_size=args.batch_size, + drop_remainder=args.drop_remainder, + is_training=True, + mixup=args.mixup, + cutmix=args.cutmix, + cutmix_prob=args.cutmix_prob, + num_classes=num_classes, + transform=transform_list, + 
+
+    if args.val_while_train:
+        dataset_eval = create_dataset(
+            name=args.dataset,
+            root=args.data_dir,
+            split=args.val_split,
+            num_shards=device_num,
+            shard_id=rank_id,
+            num_parallel_workers=args.num_parallel_workers,
+            download=args.dataset_download,
+        )
+
+        transform_list_eval = create_transforms(
+            dataset_name=args.dataset,
+            is_training=False,
+            image_resize=args.image_resize,
+            crop_pct=args.crop_pct,
+            interpolation=args.interpolation,
+            mean=args.mean,
+            std=args.std,
+        )
+
+        loader_eval = create_loader(
+            dataset=dataset_eval,
+            batch_size=args.batch_size,
+            drop_remainder=False,
+            is_training=False,
+            transform=transform_list_eval,
+            num_parallel_workers=args.num_parallel_workers,
+        )
+        # validation dataset count
+        eval_count = dataset_eval.get_dataset_size()
+        if args.distribute:
+            all_reduce = AllReduceSum()
+            eval_count = all_reduce(Tensor(eval_count, ms.int32))
+    else:
+        loader_eval = None
+        eval_count = None
+
+    num_batches = loader_train.get_dataset_size()
+    # train dataset count
+    train_count = dataset_train.get_dataset_size()
+    if args.distribute:
+        all_reduce = AllReduceSum()
+        train_count = all_reduce(Tensor(train_count, ms.int32))
+
+    # create model
+    network = create_model(
+        model_name=args.model,
+        num_classes=num_classes,
+        in_channels=args.in_channels,
+        drop_rate=args.drop_rate,
+        drop_path_rate=args.drop_path_rate,
+        pretrained=args.pretrained,
+        checkpoint_path=args.ckpt_path,
+        ema=args.ema,
+    )
+
+    num_params = sum([param.size for param in network.get_parameters()])
+
+    # create loss
+    loss = create_loss(
+        name=args.loss,
+        reduction=args.reduction,
+        label_smoothing=args.label_smoothing,
+        aux_factor=args.aux_factor,
+    )
+
+    # create learning rate schedule
+    lr_scheduler = create_scheduler(
+        num_batches,
+        scheduler=args.scheduler,
+        lr=args.lr,
+        min_lr=args.min_lr,
+        warmup_epochs=args.warmup_epochs,
+        warmup_factor=args.warmup_factor,
+        decay_epochs=args.decay_epochs,
+        decay_rate=args.decay_rate,
+        milestones=args.multi_step_decay_milestones,
+        num_epochs=args.epoch_size,
+        num_cycles=args.num_cycles,
+        cycle_decay=args.cycle_decay,
+        lr_epoch_stair=args.lr_epoch_stair,
+    )
+
+    # resume training if ckpt_path is given
+    if args.ckpt_path != "" and args.resume_opt:
+        opt_ckpt_path = os.path.join(args.ckpt_save_dir, f"optim_{args.model}.ckpt")
+    else:
+        opt_ckpt_path = ""
+
+    # create optimizer
+    # TODO: consistent naming opt, name, dataset_name
+    if (
+        args.loss_scale_type == "fixed"
+        and args.drop_overflow_update is False
+        and not require_customized_train_step(
+            args.ema,
+            args.clip_grad,
+            args.gradient_accumulation_steps,
+            args.amp_cast_list,
+        )
+    ):
+        optimizer_loss_scale = args.loss_scale
+    else:
+        optimizer_loss_scale = 1.0
+    optimizer = create_optimizer(
+        network.trainable_params(),
+        opt=args.opt,
+        lr=lr_scheduler,
+        weight_decay=args.weight_decay,
+        momentum=args.momentum,
+        nesterov=args.use_nesterov,
+        filter_bias_and_bn=args.filter_bias_and_bn,
+        loss_scale=optimizer_loss_scale,
+        checkpoint_path=opt_ckpt_path,
+        eps=args.eps,
+    )
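+
+    # Restating the branch above: with a fixed loss scale, no overflow-drop and
+    # no customized train step, scaling is folded into the optimizer; otherwise
+    # the trainer applies the loss scale itself and the optimizer is given 1.0.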
+
+    # Define eval metrics.
+    metrics = get_metrics(num_classes)
+
+    # create trainer
+    trainer = create_trainer(
+        network,
+        loss,
+        optimizer,
+        metrics,
+        amp_level=args.amp_level,
+        amp_cast_list=args.amp_cast_list,
+        loss_scale_type=args.loss_scale_type,
+        loss_scale=args.loss_scale,
+        drop_overflow_update=args.drop_overflow_update,
+        ema=args.ema,
+        ema_decay=args.ema_decay,
+        clip_grad=args.clip_grad,
+        clip_value=args.clip_value,
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+    )
+
+    # callbacks: save checkpoints, summarize training loss,
+    # record val acc and do model selection if a val dataset is available
+    begin_step = 0
+    begin_epoch = 0
+    if args.ckpt_path != "":
+        begin_step = optimizer.global_step.asnumpy()[0]
+        # recover the epoch from the checkpoint file name,
+        # assuming the pattern "{model}-{epoch}_{step}.ckpt"
+        begin_epoch = args.ckpt_path.split("/")[-1].split("-")[1].split("_")[0]
+        begin_epoch = int(begin_epoch)
+
+    summary_dir = f"./{args.ckpt_save_dir}/summary"
+    assert (
+        args.ckpt_save_policy != "top_k" or args.val_while_train is True
+    ), "when ckpt_save_policy is 'top_k', val_while_train must be True."
+    state_cb = StateMonitor(
+        trainer,
+        model_name=args.model,
+        model_ema=args.ema,
+        last_epoch=begin_epoch,
+        dataset_sink_mode=args.dataset_sink_mode,
+        dataset_val=loader_eval,
+        metric_name=list(metrics.keys()),
+        val_interval=args.val_interval,
+        ckpt_save_dir=args.ckpt_save_dir,
+        ckpt_save_interval=args.ckpt_save_interval,
+        ckpt_save_policy=args.ckpt_save_policy,
+        ckpt_keep_max=args.keep_checkpoint_max,
+        summary_dir=summary_dir,
+        log_interval=args.log_interval,
+        rank_id=rank_id,
+        device_num=device_num,
+    )
+
+    callbacks = [state_cb]
+    essential_cfg_msg = "\n".join(
+        [
+            "Essential Experiment Configurations:",
+            f"MindSpore mode[GRAPH(0)/PYNATIVE(1)]: {args.mode}",
+            f"Distributed mode: {args.distribute}",
+            f"Number of devices: {device_num if device_num is not None else 1}",
+            f"Number of training samples: {train_count}",
+            f"Number of validation samples: {eval_count}",
+            f"Number of classes: {num_classes}",
+            f"Number of batches: {num_batches}",
+            f"Batch size: {args.batch_size}",
+            f"Auto augment: {args.auto_augment}",
+            f"MixUp: {args.mixup}",
+            f"CutMix: {args.cutmix}",
+            f"Model: {args.model}",
+            f"Model parameters: {num_params}",
+            f"Number of epochs: {args.epoch_size}",
+            f"Optimizer: {args.opt}",
+            f"Learning rate: {args.lr}",
+            f"LR Scheduler: {args.scheduler}",
+            f"Momentum: {args.momentum}",
+            f"Weight decay: {args.weight_decay}",
+            f"Auto mixed precision: {args.amp_level}",
+            f"Loss scale: {args.loss_scale}({args.loss_scale_type})",
+        ]
+    )
+    logger.info(essential_cfg_msg)
+    save_args(args, os.path.join(args.ckpt_save_dir, f"{args.model}.yaml"), rank_id)
+
+    if args.ckpt_path != "":
+        logger.info(f"Resume training from {args.ckpt_path}, last step: {begin_step}, last epoch: {begin_epoch}")
+    else:
+        logger.info("Start training")
+
+    trainer.train(args.epoch_size, loader_train, callbacks=callbacks, dataset_sink_mode=args.dataset_sink_mode)
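+
+
+# The platform helpers below duplicate the ones defined earlier in this patch,
+# presumably so this dump-enabled variant stays self-contained.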
+def C2netMultiObsToEnv(multi_data_url, data_dir):
+    # --multi_data_url is a JSON string; parse it to get the list of datasets
+    print(multi_data_url)
+    multi_data_json = json.loads(multi_data_url)
+    for i in range(len(multi_data_json)):
+        zipfile_path = data_dir + "/" + multi_data_json[i]["dataset_name"]
+        try:
+            mox.file.copy(multi_data_json[i]["dataset_url"], zipfile_path)
+            print("Successfully downloaded {} to {}".format(multi_data_json[i]["dataset_url"], zipfile_path))
+            # get the file name (without extension) and unzip the dataset there
+            filename = os.path.splitext(multi_data_json[i]["dataset_name"])[0]
+            filePath = data_dir + "/" + filename
+            if not os.path.exists(filePath):
+                os.makedirs(filePath)
+            os.system("unzip {} -d {}".format(zipfile_path, filePath))
+        except Exception as e:
+            print('moxing download {} to {} failed: '.format(
+                multi_data_json[i]["dataset_url"], zipfile_path) + str(e))
+    # Write a sentinel file to mark that the data has been copied from OBS.
+    # During multi-card training the other ranks wait for this file instead of
+    # copying the dataset again.
+    with open("/cache/download_input.txt", 'w'):
+        pass
+    if os.path.exists("/cache/download_input.txt"):
+        print("download_input succeed")
+    else:
+        print("download_input failed")
+    return
+
+
+def DownloadFromQizhi(multi_data_url, data_dir):
+    device_num = int(os.getenv('RANK_SIZE'))
+    if device_num == 1:
+        C2netMultiObsToEnv(multi_data_url, data_dir)
+        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)
+    if device_num > 1:
+        # set device_id and init for multi-card training
+        context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target,
+                            device_id=int(os.getenv('ASCEND_DEVICE_ID')))
+        context.reset_auto_parallel_context()
+        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
+                                          gradients_mean=True, parameter_broadcast=True)
+        init()
+        # Copying OBS data only needs to happen once per node: let the 0th card copy it
+        local_rank = int(os.getenv('RANK_ID'))
+        if local_rank % 8 == 0:
+            C2netMultiObsToEnv(multi_data_url, data_dir)
+        # If the sentinel file does not exist, the data copy has not finished yet,
+        # so wait for the 0th card to finish copying the data
+        while not os.path.exists("/cache/download_input.txt"):
+            time.sleep(1)
+    return
+
+
+def EnvToObs(train_dir, obs_train_url):
+    try:
+        mox.file.copy_parallel(train_dir, obs_train_url)
+        print("Successfully uploaded {} to {}".format(train_dir, obs_train_url))
+    except Exception as e:
+        print('moxing upload {} to {} failed: '.format(train_dir, obs_train_url) + str(e))
+    return
+
+
+def UploadToQizhi(train_dir, obs_train_url):
+    device_num = int(os.getenv('RANK_SIZE'))
+    local_rank = int(os.getenv('RANK_ID'))
+    if device_num == 1:
+        EnvToObs(train_dir, obs_train_url)
+    if device_num > 1:
+        if local_rank % 8 == 0:
+            EnvToObs(train_dir, obs_train_url)
+    return
+
+
+if __name__ == "__main__":
+    args = parse_args()
+
+    # Point MindSpore at the dump configuration and give diagnostic data a home.
+    # Setting the variables in-process replaces the earlier commented-out
+    # `os.system("export ...")` attempt, which would only affect a child shell.
+    os.environ['MINDSPORE_DUMP_CONFIG'] = '/cache/code/levit_new/data_dump.json'
+    os.environ['MS_DIAGNOSTIC_DATA_PATH'] = '/cache/output'
+
+    # modelarts
+    data_dir = '/cache/dataset'
+    train_dir = '/cache/output'
+    ckpt_url = '/cache/checkpoint.ckpt'
+    if not os.path.exists(data_dir):
+        os.makedirs(data_dir)
+    if not os.path.exists(train_dir):
+        os.makedirs(train_dir)
+    print(args.multi_data_url)
+    DownloadFromQizhi(args.multi_data_url, data_dir)
+
+    train(args)
+
+    UploadToQizhi(train_dir, args.train_url)
+
+    # # data sync for cloud platform if enabled
+    # if args.enable_modelarts:
+    #     import moxing as mox
+
+    #     args.data_dir = f"/cache/{args.data_url}"
+    #     mox.file.copy_parallel(src_url=os.path.join(args.data_url, args.dataset), dst_url=args.data_dir)
+
+    # # core training
+    # train(args)
+
+    # if args.enable_modelarts:
+    #     mox.file.copy_parallel(src_url=args.ckpt_save_dir, dst_url=args.train_url)
\ No newline at end of file