From 47390b8f53a32eebfe41424af2dc1bc1033786f3 Mon Sep 17 00:00:00 2001 From: "song.jian" Date: Mon, 5 Dec 2022 14:15:46 +0800 Subject: [PATCH 1/5] batch_size cong 512 xiugaidao 256 Signed-off-by: song.jian --- cv/classification/resnest50/pytorch/train_resnest50_amp_dist.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cv/classification/resnest50/pytorch/train_resnest50_amp_dist.sh b/cv/classification/resnest50/pytorch/train_resnest50_amp_dist.sh index af76da2a5..3331e037e 100755 --- a/cv/classification/resnest50/pytorch/train_resnest50_amp_dist.sh +++ b/cv/classification/resnest50/pytorch/train_resnest50_amp_dist.sh @@ -26,5 +26,5 @@ fi cd ${ROOT_DIR} python3 $PYTHONARG ${ROOT_DIR}/run_train.py \ --model resnest50 --dali --dali-cpu --data-path $DATA_PATH \ - --opt fused_sgd --batch-size 512 --lr 0.0125 \ + --opt fused_sgd --batch-size 256 --lr 0.0125 \ --amp --nhwc "$@" -- Gitee From a4bedf6a41c074597716de5fb3c60414958fc28d Mon Sep 17 00:00:00 2001 From: "song.jian" Date: Mon, 5 Dec 2022 17:03:36 +0800 Subject: [PATCH 2/5] update script_name amp_4cards.sh amp_8cards.sh fp32_4cards.sh fp32_8cards.sh fp32_16cards.sh Signed-off-by: song.jian --- cv/classification/resnet50/pytorch/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cv/classification/resnet50/pytorch/README.md b/cv/classification/resnet50/pytorch/README.md index c388ce51c..85bd0f571 100644 --- a/cv/classification/resnet50/pytorch/README.md +++ b/cv/classification/resnet50/pytorch/README.md @@ -21,17 +21,17 @@ bash amp_1card.sh --data-path /path/to/imagenet ``` ### Multiple GPUs on one machine ```bash -bash fp32_4card.sh --data-path /path/to/imagenet -bash fp32_8card.sh --data-path /path/to/imagenet +bash fp32_4cards.sh --data-path /path/to/imagenet +bash fp32_8cards.sh --data-path /path/to/imagenet ``` ### Multiple GPUs on one machine (AMP) ```bash -bash amp_4card.sh --data-path /path/to/imagenet -bash amp_8card.sh --data-path /path/to/imagenet +bash amp_4cards.sh --data-path /path/to/imagenet +bash amp_8cards.sh --data-path /path/to/imagenet ``` ### Multiple GPUs on two machines ```bash -bash fp32_16card.sh --data-path /path/to/imagenet +bash fp32_16cards.sh --data-path /path/to/imagenet ``` ## Results on BI-V100 -- Gitee From 699a559d92447852ef2aff06f69353ad620cf41f Mon Sep 17 00:00:00 2001 From: "song.jian" Date: Mon, 5 Dec 2022 17:27:25 +0800 Subject: [PATCH 3/5] train_dist.sh modify dist_train.sh add step: apt install dos2unix Signed-off-by: song.jian --- cv/detection/autoassign/pytorch/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cv/detection/autoassign/pytorch/README.md b/cv/detection/autoassign/pytorch/README.md index 3037ca1ca..02a110b62 100755 --- a/cv/detection/autoassign/pytorch/README.md +++ b/cv/detection/autoassign/pytorch/README.md @@ -17,6 +17,7 @@ $ MMCV_WITH_OPS=1 python3 setup.py build && cp build/lib.linux*/mmcv/_ext.cpytho ```bash $ cd /path/to/modelzoo/cv/detection/autoassign/pytorch $ mkdir -p data && cd data +$ apt install dos2unix # Download from homepage of coco: https://cocodataset.org/ ``` @@ -30,7 +31,7 @@ $ python3 train.py [training args] # config file can be found in ### Multiple GPUs on one machine ```bash -$ bash train_dist.sh [training args] # config file can be found in the configs directory +$ bash dist_train.sh [training args] # config file can be found in the configs directory ``` ## Reference -- Gitee From c752e947fc41cce233a80ea2c16c7e7def2c15b0 Mon Sep 17 00:00:00 2001 From: "song.jian" Date: Mon, 5 Dec 2022 17:50:15 +0800 Subject: [PATCH 4/5] train.py modify "import torchvision as torchvision" Signed-off-by: song.jian --- cv/classification/efficientb4/pytorch/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cv/classification/efficientb4/pytorch/train.py b/cv/classification/efficientb4/pytorch/train.py index 3528f9483..449cf2544 100755 --- a/cv/classification/efficientb4/pytorch/train.py +++ b/cv/classification/efficientb4/pytorch/train.py @@ -20,7 +20,7 @@ except: from torch import nn import torch.distributed as dist -import _torchvision as torchvision +import torchvision as torchvision import utils -- Gitee From 5ff9fba8b4008463202de84dc5d7819d01fc360c Mon Sep 17 00:00:00 2001 From: "song.jian" Date: Mon, 5 Dec 2022 17:58:14 +0800 Subject: [PATCH 5/5] update timm editon pip install timm==0.4.12 Signed-off-by: song.jian --- cv/classification/convnext/pytorch/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cv/classification/convnext/pytorch/README.md b/cv/classification/convnext/pytorch/README.md index de160e208..1c1d094c1 100644 --- a/cv/classification/convnext/pytorch/README.md +++ b/cv/classification/convnext/pytorch/README.md @@ -5,7 +5,7 @@ The ConvNeXT model was proposed in [A ConvNet for the 2020s](https://arxiv.org/a ## Step 1: Installing ```bash -pip install timm==0.4.5 tensorboardX six torch torchvision +pip install timm==0.4.12 tensorboardX six torch torchvision ``` Sign up and login in [imagenet official website](https://www.image-net.org/index.php), then choose 'Download' to download the whole imagenet dataset. Specify `/path/to/imagenet` to your imagenet path in later training process. -- Gitee