diff --git a/.gitignore b/.gitignore index 20a41310..37895038 100644 --- a/.gitignore +++ b/.gitignore @@ -132,3 +132,9 @@ dmypy.json # IDEs .idea/ .vscode/ + +# outputs +rank_*/ +ckpt/ +output/ +outputs/ diff --git a/README.md b/README.md index e059889c..44fc204b 100644 --- a/README.md +++ b/README.md @@ -157,7 +157,7 @@ In contrast, [pynative mode](https://www.mindspore.cn/tutorials/zh-CN/r1.8/advan **Mixed Mode**: -[Pynative mode with ms_function ](https://www.mindspore.cn/tutorials/zh-CN/r1.8/advanced/pynative_graph/combine.html) is a mixed mode for comprising flexibility and efficiency in MindSpore. To apply pynative mode with ms_function for training, please run `train_with_func.py`, e.g., +[PyNative mode with mindspore.jit](https://www.mindspore.cn/tutorials/zh-CN/r1.8/advanced/pynative_graph/combine.html) is a mixed mode for comprising flexibility and efficiency in MindSpore. To apply pynative mode with mindspore.jit for training, please run `train_with_func.py`, e.g., ```shell python train_with_func.py --model=resnet50 --dataset=cifar10 --dataset_download --epoch_size=10 diff --git a/README_CN.md b/README_CN.md index 9a1f3922..8b28062c 100644 --- a/README_CN.md +++ b/README_CN.md @@ -157,7 +157,7 @@ MindCV是一个基于 [MindSpore](https://www.mindspore.cn/) 开发的,致力 **混合模式** -[基于ms_function的混合模式](https://www.mindspore.cn/tutorials/zh-CN/r1.8/advanced/pynative_graph/combine.html) 是兼顾了MindSpore的效率和灵活的混合模式。用户可通过使用`train_with_func.py`文件来使用该混合模式进行训练。 +[基于mindspore.jit的混合模式](https://www.mindspore.cn/tutorials/zh-CN/r1.8/advanced/pynative_graph/combine.html) 是兼顾了MindSpore的效率和灵活的混合模式。用户可通过使用`train_with_func.py`文件来使用该混合模式进行训练。 ```shell python train_with_func.py --model=resnet50 --dataset=cifar10 --dataset_download --epoch_size=10 diff --git a/docs/en/index.md b/docs/en/index.md index 11c9a73d..c5ad4090 100644 --- a/docs/en/index.md +++ b/docs/en/index.md @@ -154,7 +154,7 @@ It is easy to train your model on a standard or customized dataset using `train. !!! warning "Mixed Mode" - [Pynative mode with ms_function](https://www.mindspore.cn/tutorials/zh-CN/r1.8/advanced/pynative_graph/combine.html) is a mixed mode for comprising flexibility and efficiency in MindSpore. To apply pynative mode with ms_function for training, please run `train_with_func.py`, e.g., + [PyNative mode with mindspore.jit](https://www.mindspore.cn/tutorials/zh-CN/r1.8/advanced/pynative_graph/combine.html) is a mixed mode for comprising flexibility and efficiency in MindSpore. To apply pynative mode with mindspore.jit for training, please run `train_with_func.py`, e.g., ```shell python train_with_func.py --model=resnet50 --dataset=cifar10 --dataset_download --epoch_size=10 diff --git a/docs/zh/index.md b/docs/zh/index.md index f6d703a9..a0811ed5 100644 --- a/docs/zh/index.md +++ b/docs/zh/index.md @@ -153,7 +153,7 @@ MindCV是一个基于 [MindSpore](https://www.mindspore.cn/) 开发的,致力 !!! 
warning "混合模式" - [基于ms_function的混合模式](https://www.mindspore.cn/tutorials/zh-CN/r1.8/advanced/pynative_graph/combine.html) 是兼顾了MindSpore的效率和灵活的混合模式。用户可通过使用`train_with_func.py`文件来使用该混合模式进行训练。 + [基于mindspore.jit的混合模式](https://www.mindspore.cn/tutorials/zh-CN/r1.8/advanced/pynative_graph/combine.html) 是兼顾了MindSpore的效率和灵活的混合模式。用户可通过使用`train_with_func.py`文件来使用该混合模式进行训练。 ```shell python train_with_func.py --model=resnet50 --dataset=cifar10 --dataset_download --epoch_size=10 diff --git a/examples/train_parallel_with_func_example.py b/examples/train_parallel_with_func_example.py index 73995926..b6be699a 100644 --- a/examples/train_parallel_with_func_example.py +++ b/examples/train_parallel_with_func_example.py @@ -14,7 +14,12 @@ from mindcv.loss import create_loss from mindcv.models import create_model from mindcv.optim import create_optimizer -from mindcv.utils import Allreduce +from mindcv.utils import AllReduceSum + +try: + from mindspore import jit +except ImportError: + from mindspore import ms_function as jit def main(): @@ -117,7 +122,7 @@ def forward_fn(data, label): grad_reducer = nn.DistributedGradReducer(optimizer.parameters, mean, degree) # Define function of one-step training, - @ms.ms_function + @jit def train_step_parallel(data, label): (loss, _), grads = grad_fn(data, label) grads = grad_reducer(grads) @@ -143,7 +148,7 @@ def test_epoch(network, dataset): correct += (pred.argmax(1) == label).asnumpy().sum() else: # one-hot or soft label correct += (pred.argmax(1) == label.argmax(1)).asnumpy().sum() - all_reduce = Allreduce() + all_reduce = AllReduceSum() correct = all_reduce(Tensor(correct, ms.float32)) total = all_reduce(Tensor(total, ms.float32)) correct /= total diff --git a/examples/train_with_func_example.py b/examples/train_with_func_example.py index 110051fe..b64785e4 100644 --- a/examples/train_with_func_example.py +++ b/examples/train_with_func_example.py @@ -12,6 +12,11 @@ from mindcv.models import create_model from mindcv.optim import create_optimizer +try: + from mindspore import jit +except ImportError: + from mindspore import ms_function as jit + def main(): ms.set_seed(1) @@ -96,7 +101,7 @@ def forward_fn(data, label): grad_fn = ops.value_and_grad(forward_fn, None, optimizer.parameters, has_aux=True) # Define function of one-step training, - @ms.ms_function + @jit def train_step(data, label): (loss, _), grads = grad_fn(data, label) loss = ops.depend(loss, optimizer(grads)) diff --git a/mindcv/models/cait.py b/mindcv/models/cait.py index a9c90ab6..40ca6ec1 100644 --- a/mindcv/models/cait.py +++ b/mindcv/models/cait.py @@ -13,6 +13,7 @@ from mindspore.common.initializer import TruncatedNormal from .helpers import load_pretrained +from .layers.compatibility import Dropout from .layers.drop_path import DropPath from .layers.mlp import Mlp from .layers.patch_embed import PatchEmbed @@ -41,12 +42,12 @@ def _cfg(url='', **kwargs): default_cfgs = { "cait_xxs24_224": _cfg(url=''), - "cait_xs24_384": _cfg(url=''), + "cait_xs24_384": _cfg(url='', input_size=(3, 384, 384)), "cait_s24_224": _cfg(url=''), - "cait_s24_384": _cfg(url=''), - "cait_s36_384": _cfg(url=''), - "cait_m36_384": _cfg(url=''), - "cait_m48_448": _cfg(url=''), + "cait_s24_384": _cfg(url='', input_size=(3, 384, 384)), + "cait_s36_384": _cfg(url='', input_size=(3, 384, 384)), + "cait_m36_384": _cfg(url='', input_size=(3, 384, 384)), + "cait_m48_448": _cfg(url='', input_size=(3, 448, 448)), } @@ -67,9 +68,9 @@ def __init__(self, self.q = nn.Dense(dim, dim, has_bias=qkv_bias) self.k = nn.Dense(dim, dim, 
has_bias=qkv_bias) self.v = nn.Dense(dim, dim, has_bias=qkv_bias) - self.attn_drop = nn.Dropout(1 - attn_drop_rate) + self.attn_drop = Dropout(p=attn_drop_rate) self.proj = nn.Dense(dim, dim) - self.proj_drop = nn.Dropout(1 - proj_drop_rate) + self.proj_drop = Dropout(p=proj_drop_rate) self.softmax = nn.Softmax(axis=-1) self.attn_matmul_v = ops.BatchMatMul() @@ -156,14 +157,14 @@ def __init__(self, self.scale = qk_scale or head_dim ** -0.5 self.qkv = nn.Dense(dim, dim * 3, has_bias=qkv_bias) - self.attn_drop = nn.Dropout(1 - attn_drop_rate) + self.attn_drop = Dropout(p=attn_drop_rate) self.proj = nn.Dense(dim, dim, has_bias=False) self.proj_l = nn.Dense(num_heads, num_heads, has_bias=False) self.proj_w = nn.Dense(num_heads, num_heads, has_bias=False) - self.proj_drop = nn.Dropout(1 - proj_drop_rate) + self.proj_drop = Dropout(p=proj_drop_rate) self.softmax = nn.Softmax(axis=-1) @@ -271,7 +272,7 @@ def __init__(self, zeros = ops.Zeros() self.cls_token = Parameter(zeros((1, 1, embed_dim), ms.float32)) self.pos_embed = Parameter(zeros((1, num_patches, embed_dim), ms.float32)) - self.pos_drop = nn.Dropout(1 - drop_rate) + self.pos_drop = Dropout(p=drop_rate) dpr = [drop_path_rate for i in range(depth)] diff --git a/mindcv/models/coat.py b/mindcv/models/coat.py index b2b30415..09645d33 100644 --- a/mindcv/models/coat.py +++ b/mindcv/models/coat.py @@ -10,10 +10,11 @@ import mindspore.common.initializer as init import mindspore.nn as nn import mindspore.ops as ops -from mindspore import Tensor, ms_function +from mindspore import Tensor from mindspore.numpy import split from .helpers import load_pretrained +from .layers.compatibility import Dropout, Interpolate from .layers.drop_path import DropPath from .layers.identity import Identity from .registry import register_model @@ -70,7 +71,7 @@ def __init__( self.fc1 = nn.Dense(in_channels=in_features, out_channels=hidden_features, has_bias=True) self.act = nn.GELU(approximate=False) self.fc2 = nn.Dense(in_channels=hidden_features, out_channels=out_features, has_bias=True) - self.drop = nn.Dropout(keep_prob=1.0 - drop) + self.drop = Dropout(p=drop) def construct(self, x: Tensor) -> Tensor: x = self.fc1(x) @@ -118,7 +119,6 @@ def __init__( self.idx1 = self.channel_splits[0] self.idx2 = self.channel_splits[0] + self.channel_splits[1] - @ms_function def construct(self, q, v, size) -> Tensor: B, h, N, Ch = q.shape @@ -167,9 +167,9 @@ def __init__( self.q = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias) self.k = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias) self.v = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias) - self.attn_drop = nn.Dropout(keep_prob=1 - attn_drop) + self.attn_drop = Dropout(p=attn_drop) self.proj = nn.Dense(dim, dim) - self.proj_drop = nn.Dropout(keep_prob=1 - proj_drop) + self.proj_drop = Dropout(p=proj_drop) self.softmax = nn.Softmax(axis=-1) self.batch_matmul = ops.BatchMatMul() @@ -323,6 +323,7 @@ def __init__( shared_crpe=shared_crpes[3] ) self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() + self.interpolate_fn = Interpolate(mode="bilinear", align_corners=True) self.norm22 = nn.LayerNorm((dims[1],), epsilon=1e-6) self.norm23 = nn.LayerNorm((dims[2],), epsilon=1e-6) @@ -349,10 +350,7 @@ def interpolate(self, x, output_size, size) -> Tensor: img_tokens = ops.transpose(img_tokens, (0, 2, 1)) img_tokens = ops.reshape(img_tokens, (B, C, H, W)) - img_tokens = ops.interpolate(img_tokens, - sizes=output_size, - mode='bilinear' - ) + img_tokens = self.interpolate_fn(img_tokens, size=output_size) img_tokens = ops.reshape(img_tokens, (B, C, -1)) img_tokens = ops.transpose(img_tokens, (0, 2, 1)) diff --git a/mindcv/models/convit.py b/mindcv/models/convit.py index f94ca253..45884b9c 100644 --- a/mindcv/models/convit.py +++ b/mindcv/models/convit.py @@ -11,6 +11,7 @@ from mindspore.ops import constexpr from .helpers import load_pretrained +from .layers.compatibility import Dropout from .layers.drop_path import DropPath from .layers.identity import Identity from .layers.mlp import Mlp @@ -85,10 +86,10 @@ def __init__( self.k = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias) self.v = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias) - self.attn_drop = nn.Dropout(keep_prob=1.0 - attn_drop) + self.attn_drop = Dropout(p=attn_drop) self.proj = nn.Dense(in_channels=dim, out_channels=dim) self.pos_proj = nn.Dense(in_channels=3, out_channels=num_heads) - self.proj_drop = nn.Dropout(keep_prob=1.0 - proj_drop) + self.proj_drop = Dropout(p=proj_drop) self.gating_param = Parameter(ops.ones((num_heads), ms.float32)) self.softmax = nn.Softmax(axis=-1) self.batch_matmul = ops.BatchMatMul() @@ -144,9 +145,9 @@ def __init__( self.q = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias) self.k = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias) self.v = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias) - self.attn_drop = nn.Dropout(keep_prob=1.0 - attn_drop) + self.attn_drop = Dropout(p=attn_drop) self.proj = nn.Dense(in_channels=dim, out_channels=dim) - self.proj_drop = nn.Dropout(keep_prob=1.0 - proj_drop) + self.proj_drop = Dropout(p=proj_drop) self.softmax = nn.Softmax(axis=-1) self.batch_matmul = ops.BatchMatMul() @@ -261,7 +262,7 @@ def __init__( self.num_patches = self.patch_embed.num_patches self.cls_token = Parameter(ops.Zeros()((1, 1, embed_dim), ms.float32)) - self.pos_drop = nn.Dropout(keep_prob=1.0 - drop_rate) + self.pos_drop = Dropout(p=drop_rate) if self.use_pos_embed: self.pos_embed = Parameter(ops.Zeros()((1, self.num_patches, embed_dim), ms.float32)) diff --git a/mindcv/models/crossvit.py b/mindcv/models/crossvit.py index 591a7bd8..e1db068b 100644 --- a/mindcv/models/crossvit.py +++ b/mindcv/models/crossvit.py @@ -14,6 +14,7 @@ from mindspore.common.initializer import TruncatedNormal from .helpers import load_pretrained +from .layers.compatibility import Dropout, Interpolate from .layers.drop_path import DropPath from .layers.helpers import to_2tuple from .layers.identity import Identity @@ -55,9 +56,9 @@ def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.) 
self.scale = head_dim ** -0.5 self.qkv = nn.Dense(dim, dim * 3, has_bias=qkv_bias) - self.attn_drop = nn.Dropout(1.0 - attn_drop) + self.attn_drop = Dropout(p=attn_drop) self.proj = nn.Dense(dim, dim) - self.proj_drop = nn.Dropout(1.0 - proj_drop) + self.proj_drop = Dropout(p=proj_drop) def construct(self, x: Tensor) -> Tensor: B, N, C = x.shape @@ -157,9 +158,9 @@ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0. self.wq = nn.Dense(dim, dim, has_bias=qkv_bias) self.wk = nn.Dense(dim, dim, has_bias=qkv_bias) self.wv = nn.Dense(dim, dim, has_bias=qkv_bias) - self.attn_drop = nn.Dropout(1.0 - attn_drop) + self.attn_drop = Dropout(p=attn_drop) self.proj = nn.Dense(dim, dim) - self.proj_drop = nn.Dropout(1.0 - proj_drop) + self.proj_drop = Dropout(p=proj_drop) def construct(self, x: Tensor) -> Tensor: B, N, C = x.shape # 3,3,16 @@ -325,6 +326,7 @@ def __init__(self, img_size=(224, 224), patch_size=(8, 16), in_channels=3, num_c num_patches = _compute_num_patches(img_size, patch_size) self.num_branches = len(patch_size) + self.interpolate = Interpolate(mode="bilinear", align_corners=True) patch_embed = [] if hybrid_backbone is None: @@ -346,7 +348,7 @@ def __init__(self, img_size=(224, 224), patch_size=(8, 16), in_channels=3, num_c d.append(c) d = tuple(d) self.cls_token = ms.ParameterTuple(d) - self.pos_drop = nn.Dropout(1.0 - drop_rate) + self.pos_drop = Dropout(p=drop_rate) total_depth = sum([sum(x[-2:]) for x in depth]) dpr = np.linspace(0, drop_path_rate, total_depth) # stochastic depth decay rule @@ -403,8 +405,7 @@ def forward_features(self, x: Tensor) -> Tensor: xs = [] # print(x) for i in range(self.num_branches): - x_ = ops.interpolate(x, sizes=(self.img_size[i], self.img_size[i]), mode='bilinear') if H != self.img_size[ - i] else x + x_ = self.interpolate(x, size=(self.img_size[i], self.img_size[i])) if H != self.img_size[i] else x tmp = self.patch_embed[i](x_) z = self.cls_token[i].shape y = Tensor(np.ones((B, z[1], z[2])), dtype=mstype.float32) diff --git a/mindcv/models/densenet.py b/mindcv/models/densenet.py index 5a211c7d..f8b29426 100644 --- a/mindcv/models/densenet.py +++ b/mindcv/models/densenet.py @@ -11,6 +11,7 @@ from mindspore import Tensor, nn, ops from .helpers import load_pretrained +from .layers.compatibility import Dropout from .layers.pooling import GlobalAvgPooling from .registry import register_model @@ -61,7 +62,7 @@ def __init__( self.conv2 = nn.Conv2d(bn_size * growth_rate, growth_rate, kernel_size=3, stride=1, pad_mode="pad", padding=1) self.drop_rate = drop_rate - self.dropout = nn.Dropout(keep_prob=1 - self.drop_rate) + self.dropout = Dropout(p=self.drop_rate) def construct(self, features: Tensor) -> Tensor: bottleneck = self.conv1(self.relu1(self.norm1(features))) diff --git a/mindcv/models/edgenext.py b/mindcv/models/edgenext.py index 56a22c28..82ed070d 100644 --- a/mindcv/models/edgenext.py +++ b/mindcv/models/edgenext.py @@ -13,6 +13,7 @@ from mindspore import Parameter, Tensor, nn, ops from .helpers import load_pretrained +from .layers.compatibility import Dropout, Split from .layers.drop_path import DropPath from .layers.identity import Identity from .registry import register_model @@ -53,20 +54,6 @@ def _cfg(url="", **kwargs): } -def ssplit(x: Tensor, dim, width): - B, C, H, W = x.shape - if C % width == 0: - return ops.split(x, dim, C // width) - else: - begin = 0 - temp = [] - while begin + width < C: - temp.append(x[:, begin : begin + width, :, :]) - begin += width - temp.append(x[:, begin:, :, :]) - return temp - - 
class LayerNorm(nn.LayerNorm): r"""LayerNorm for channels_first tensors with 2d spatial dimensions (ie N, C, H, W).""" @@ -209,11 +196,25 @@ def __init__( self.gamma = Parameter(Tensor(layer_scale_init_value * np.ones((dim)), ms.float32), requires_grad=True) if layer_scale_init_value > 0 else None self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + self.split = Split(split_size_or_sections=width, output_num=dim // width, axis=1) + + def ssplit(self, x: Tensor, width): + B, C, H, W = x.shape + if C % width == 0: + return self.split(x) + else: + begin = 0 + temp = [] + while begin + width < C: + temp.append(x[:, begin: begin + width, :, :]) + begin += width + temp.append(x[:, begin:, :, :]) + return temp def construct(self, x: Tensor) -> Tensor: input = x - spx = ssplit(x, 1, self.width) + spx = self.ssplit(x, self.width) sp = None out = None for i in range(self.nums): @@ -264,9 +265,9 @@ def __init__( self.temperature = Parameter(Tensor(np.ones((num_heads, 1, 1)), ms.float32)) self.qkv = nn.Dense(dim, dim * 3, has_bias=qkv_bias) - self.attn_drop = nn.Dropout(1 - attn_drop) + self.attn_drop = Dropout(p=attn_drop) self.proj = nn.Dense(dim, dim) - self.proj_drop = nn.Dropout(1 - proj_drop) + self.proj_drop = Dropout(p=proj_drop) def construct(self, x: Tensor) -> Tensor: B, N, C = x.shape @@ -362,8 +363,8 @@ def __init__(self, in_chans=3, num_classes=1000, self.norm = nn.LayerNorm((dims[-1],), epsilon=1e-6) # Final norm layer self.head = nn.Dense(dims[-1], num_classes) - # self.head_dropout = nn.Dropout(kwargs["classifier_dropout"]) - self.head_dropout = nn.Dropout(1.0) + # self.head_dropout = Dropout(kwargs["classifier_dropout"]) + self.head_dropout = Dropout(p=0.0) self.head_init_scale = head_init_scale self._initialize_weights() @@ -408,6 +409,7 @@ def edgenext_xx_small(pretrained: bool = False, num_classes: int = 1000, in_chan depths=[2, 2, 6, 2], dims=[24, 48, 88, 168], expan_ratio=4, + num_classes=num_classes, global_block=[0, 1, 1, 1], global_block_type=['None', 'SDTA', 'SDTA', 'SDTA'], use_pos_embd_xca=[False, True, False, False], diff --git a/mindcv/models/efficientnet.py b/mindcv/models/efficientnet.py index 395b0a13..43e2b26c 100644 --- a/mindcv/models/efficientnet.py +++ b/mindcv/models/efficientnet.py @@ -13,6 +13,7 @@ from .helpers import build_model_with_cfg, make_divisible from .layers.activation import Swish +from .layers.compatibility import Dropout from .layers.drop_path import DropPath from .layers.pooling import GlobalAvgPooling from .layers.squeeze_excite import SqueezeExcite @@ -436,7 +437,7 @@ def __init__( self.features = nn.SequentialCell(layers) self.avgpool = GlobalAvgPooling() - self.dropout = nn.Dropout(1 - dropout_rate) + self.dropout = Dropout(p=dropout_rate) self.mlp_head = nn.Dense(lastconv_output_channels, num_classes) self._initialize_weights() diff --git a/mindcv/models/ghostnet.py b/mindcv/models/ghostnet.py index f0f8fa73..87bab1f6 100644 --- a/mindcv/models/ghostnet.py +++ b/mindcv/models/ghostnet.py @@ -8,6 +8,7 @@ from mindspore import Tensor, nn, ops from .helpers import load_pretrained, make_divisible +from .layers.compatibility import Dropout from .layers.pooling import GlobalAvgPooling from .layers.squeeze_excite import SqueezeExcite from .registry import register_model @@ -252,7 +253,7 @@ def __init__( self.act2 = nn.ReLU() self.flatten = nn.Flatten() if self.drop_rate > 0.0: - self.dropout = nn.Dropout(keep_prob=1 - drop_rate) + self.dropout = Dropout(p=drop_rate) self.classifier = nn.Dense(out_chs, num_classes) 
self._initialize_weights() diff --git a/mindcv/models/googlenet.py b/mindcv/models/googlenet.py index c9f6925a..25256636 100644 --- a/mindcv/models/googlenet.py +++ b/mindcv/models/googlenet.py @@ -10,6 +10,7 @@ from mindspore import Tensor, nn, ops from .helpers import load_pretrained +from .layers.compatibility import Dropout from .layers.pooling import GlobalAvgPooling from .registry import register_model @@ -109,7 +110,7 @@ def __init__( self.fc2 = nn.Dense(1024, num_classes) self.flatten = nn.Flatten() self.relu = nn.ReLU() - self.dropout = nn.Dropout(1 - drop_rate) + self.dropout = Dropout(p=drop_rate) def construct(self, x: Tensor) -> Tensor: x = self.avg_pool(x) @@ -170,7 +171,7 @@ def __init__( self.aux2 = InceptionAux(528, num_classes, drop_rate=drop_rate_aux) self.pool = GlobalAvgPooling() - self.dropout = nn.Dropout(keep_prob=1 - drop_rate) + self.dropout = Dropout(p=drop_rate) self.classifier = nn.Dense(1024, num_classes) self._initialize_weights() diff --git a/mindcv/models/inception_v3.py b/mindcv/models/inception_v3.py index 12802022..8e15c5df 100644 --- a/mindcv/models/inception_v3.py +++ b/mindcv/models/inception_v3.py @@ -9,6 +9,7 @@ from mindspore import Tensor, nn, ops from .helpers import load_pretrained +from .layers.compatibility import Dropout from .layers.pooling import GlobalAvgPooling from .registry import register_model @@ -266,7 +267,7 @@ def __init__( self.inception7c = InceptionE(2048) self.pool = GlobalAvgPooling() - self.dropout = nn.Dropout(keep_prob=1 - drop_rate) + self.dropout = Dropout(p=drop_rate) self.num_features = 2048 self.classifier = nn.Dense(self.num_features, num_classes) self._initialize_weights() diff --git a/mindcv/models/inception_v4.py b/mindcv/models/inception_v4.py index b9b52c78..54bf81e3 100644 --- a/mindcv/models/inception_v4.py +++ b/mindcv/models/inception_v4.py @@ -9,6 +9,7 @@ from mindspore import Tensor, nn, ops from .helpers import load_pretrained +from .layers.compatibility import Dropout from .layers.pooling import GlobalAvgPooling from .registry import register_model @@ -278,7 +279,7 @@ def __init__( self.features = nn.SequentialCell(blocks) self.pool = GlobalAvgPooling() - self.dropout = nn.Dropout(1 - drop_rate) + self.dropout = Dropout(p=drop_rate) self.num_features = 1536 self.classifier = nn.Dense(self.num_features, num_classes) self._initialize_weights() diff --git a/mindcv/models/layers/compatibility.py b/mindcv/models/layers/compatibility.py new file mode 100644 index 00000000..e0df00a5 --- /dev/null +++ b/mindcv/models/layers/compatibility.py @@ -0,0 +1,85 @@ +import inspect + +import mindspore as ms +from mindspore import nn, ops + +__all__ = [ + "Dropout", + "Interpolate", + "Split", +] + + +class Dropout(nn.Dropout): + def __init__(self, p=0.5, dtype=ms.float32): + sig = inspect.signature(super().__init__) + if "keep_prob" in sig.parameters and "p" not in sig.parameters: + super().__init__(keep_prob=1.0-p, dtype=dtype) + elif "p" in sig.parameters: + super().__init__(p=p, dtype=dtype) + else: + raise NotImplementedError( + f"'keep_prob' or 'p' must be the parameter of `mindspore.nn.Dropout`, but got signature of it: {sig}." 
+ ) + + +class Interpolate(nn.Cell): + def __init__(self, scale_factor=None, mode="nearest", align_corners=None, recompute_scale_factor=None): + super().__init__() + sig = inspect.signature(ops.interpolate) + if "sizes" in sig.parameters: + if scale_factor is None and recompute_scale_factor is None: + self.kwargs = dict( + roi=None, + scales=None, + coordinate_transformation_mode="align_corners" if align_corners is True else "half_pixel", + mode=mode, + ) + self.size_name = "sizes" + else: + raise NotImplementedError( + "'scale_factor' and 'recompute_scale_factor' are not supported in mindspore 1.x!" + ) + elif "size" in sig.parameters: + self.kwargs = dict( + scale_factor=scale_factor, + mode=mode, + align_corners=align_corners, + recompute_scale_factor=recompute_scale_factor, + ) + self.size_name = "size" + else: + raise NotImplementedError( + f"'sizes' or 'size' must be the parameter of `mindspore.ops.interpolate`, " + f"but got signature of it: {sig}." + ) + + def construct(self, x, size=None): + if self.size_name == "sizes": + return ops.interpolate(x, sizes=size, **self.kwargs) + elif self.size_name == "size": + return ops.interpolate(x, size=size, **self.kwargs) + else: + return None + + +class Split(nn.Cell): + """Splits the tensor into chunks. + In order to ensure that your code can run on different versions of mindspore, + you need to pass in two sets of redundant parameters. + """ + def __init__(self, split_size_or_sections, output_num, axis=0): + super().__init__() + sig = inspect.signature(ops.split) + if "output_num" in sig.parameters: + self.kwargs = dict(axis=axis, output_num=output_num) + elif "split_size_or_sections" in sig.parameters: + self.kwargs = dict(split_size_or_sections=split_size_or_sections, axis=axis) + else: + raise NotImplementedError( + f"'output_num' or 'split_size_or_sections' must be the parameter of `mindspore.ops.split`, " + f"but got signature of it: {sig}."
+ ) + + def construct(self, x): + return ops.split(x, **self.kwargs) diff --git a/mindcv/models/layers/drop_path.py b/mindcv/models/layers/drop_path.py index 8bb181ce..ea037473 100644 --- a/mindcv/models/layers/drop_path.py +++ b/mindcv/models/layers/drop_path.py @@ -6,6 +6,8 @@ from mindspore import Tensor, nn, ops from mindspore.numpy import ones +from .compatibility import Dropout + class DropPath(nn.Cell): """DropPath (Stochastic Depth) regularization layers""" @@ -18,7 +20,7 @@ def __init__( super().__init__() self.keep_prob = 1.0 - drop_prob self.scale_by_keep = scale_by_keep - self.dropout = nn.Dropout(self.keep_prob) + self.dropout = Dropout(p=drop_prob) def construct(self, x: Tensor) -> Tensor: if self.keep_prob == 1.0 or not self.training: diff --git a/mindcv/models/layers/mlp.py b/mindcv/models/layers/mlp.py index 222f40b3..7da27a4a 100644 --- a/mindcv/models/layers/mlp.py +++ b/mindcv/models/layers/mlp.py @@ -4,6 +4,8 @@ from mindspore import Tensor, nn +from .compatibility import Dropout + class Mlp(nn.Cell): def __init__( @@ -20,7 +22,7 @@ def __init__( self.fc1 = nn.Dense(in_channels=in_features, out_channels=hidden_features, has_bias=True) self.act = act_layer() self.fc2 = nn.Dense(in_channels=hidden_features, out_channels=out_features, has_bias=True) - self.drop = nn.Dropout(keep_prob=1.0 - drop) + self.drop = Dropout(p=drop) def construct(self, x: Tensor) -> Tensor: x = self.fc1(x) diff --git a/mindcv/models/layers/selective_kernel.py b/mindcv/models/layers/selective_kernel.py index e6ccc2ea..ddf6ebca 100644 --- a/mindcv/models/layers/selective_kernel.py +++ b/mindcv/models/layers/selective_kernel.py @@ -6,6 +6,7 @@ from mindspore import Tensor, nn, ops from ..helpers import make_divisible +from .compatibility import Split from .conv_norm_act import Conv2dNormActivation from .pooling import GlobalAvgPooling @@ -113,6 +114,7 @@ def __init__( assert in_channels % self.num_paths == 0 in_channels = in_channels // self.num_paths groups = min(out_channels, groups) + self.split = Split(split_size_or_sections=self.in_channels // self.num_paths, output_num=self.num_paths, axis=1) self.paths = nn.CellList([ Conv2dNormActivation(in_channels, out_channels, kernel_size=k, stride=stride, groups=groups, @@ -126,7 +128,7 @@ def __init__( def construct(self, x: Tensor) -> Tensor: x_paths = [] if self.split_input: - x_split = ops.split(x, axis=1, output_num=self.num_paths) + x_split = self.split(x) for i, op in enumerate(self.paths): x_paths.append(op(x_split[i])) else: diff --git a/mindcv/models/mixnet.py b/mindcv/models/mixnet.py index 1b446b58..9d9c3534 100644 --- a/mindcv/models/mixnet.py +++ b/mindcv/models/mixnet.py @@ -10,6 +10,7 @@ from mindspore import Tensor, nn, ops from .helpers import load_pretrained +from .layers.compatibility import Dropout from .layers.pooling import GlobalAvgPooling from .layers.squeeze_excite import SqueezeExcite from .registry import register_model @@ -339,7 +340,7 @@ def __init__( ]) self.pool = GlobalAvgPooling() - self.dropout = nn.Dropout(keep_prob=1 - drop_rate) + self.dropout = Dropout(p=drop_rate) self.classifier = nn.Dense(feature_size, num_classes) self._initialize_weights() diff --git a/mindcv/models/mlpmixer.py b/mindcv/models/mlpmixer.py index 5c48c0ce..22e0ff7b 100644 --- a/mindcv/models/mlpmixer.py +++ b/mindcv/models/mlpmixer.py @@ -7,6 +7,7 @@ import mindspore.ops as ops from .helpers import load_pretrained +from .layers.compatibility import Dropout from .registry import register_model __all__ = [ @@ -50,9 +51,9 @@ def __init__(self, dim, 
hidden_dim, dropout=0.): self.net = nn.SequentialCell( nn.Dense(dim, hidden_dim), nn.GELU(), - nn.Dropout(keep_prob=1 - dropout), + Dropout(p=dropout), nn.Dense(hidden_dim, dim), - nn.Dropout(keep_prob=1 - dropout) + Dropout(p=dropout) ) def construct(self, x): diff --git a/mindcv/models/mnasnet.py b/mindcv/models/mnasnet.py index c12891b1..ce05cabc 100644 --- a/mindcv/models/mnasnet.py +++ b/mindcv/models/mnasnet.py @@ -9,6 +9,7 @@ from mindspore import Tensor, nn from .helpers import load_pretrained, make_divisible +from .layers.compatibility import Dropout from .layers.pooling import GlobalAvgPooling from .registry import register_model @@ -137,7 +138,7 @@ def __init__( ]) self.features = nn.SequentialCell(features) self.pool = GlobalAvgPooling() - self.dropout = nn.Dropout(keep_prob=1 - drop_rate) + self.dropout = Dropout(p=drop_rate) self.classifier = nn.Dense(1280, num_classes) self._initialize_weights() diff --git a/mindcv/models/mobilenet_v2.py b/mindcv/models/mobilenet_v2.py index ce802b66..7f7672a1 100644 --- a/mindcv/models/mobilenet_v2.py +++ b/mindcv/models/mobilenet_v2.py @@ -9,6 +9,7 @@ from mindspore import Tensor, nn from .helpers import load_pretrained, make_divisible +from .layers.compatibility import Dropout from .layers.pooling import GlobalAvgPooling from .registry import register_model @@ -219,7 +220,7 @@ def __init__( self.pool = GlobalAvgPooling() self.classifier = nn.SequentialCell([ - nn.Dropout(keep_prob=0.8), # confirmed by paper authors + Dropout(p=0.2), # confirmed by paper authors nn.Dense(last_channels, num_classes), ]) self._initialize_weights() diff --git a/mindcv/models/mobilenet_v3.py b/mindcv/models/mobilenet_v3.py index b9c162e0..d489cb9c 100644 --- a/mindcv/models/mobilenet_v3.py +++ b/mindcv/models/mobilenet_v3.py @@ -9,6 +9,7 @@ from mindspore import Tensor, nn from .helpers import build_model_with_cfg, make_divisible +from .layers.compatibility import Dropout from .layers.pooling import GlobalAvgPooling from .layers.squeeze_excite import SqueezeExcite from .registry import register_model @@ -202,7 +203,7 @@ def __init__( self.classifier = nn.SequentialCell([ nn.Dense(output_channels, last_channels), nn.HSwish(), - nn.Dropout(keep_prob=0.8), + Dropout(p=0.2), nn.Dense(last_channels, num_classes), ]) self._initialize_weights() diff --git a/mindcv/models/mobilevit.py b/mindcv/models/mobilevit.py index 693bd5ee..bd258665 100644 --- a/mindcv/models/mobilevit.py +++ b/mindcv/models/mobilevit.py @@ -10,6 +10,7 @@ from mindspore import Tensor, nn, ops from .helpers import load_pretrained, make_divisible +from .layers.compatibility import Dropout, Interpolate from .layers.pooling import GlobalAvgPooling from .registry import register_model @@ -211,7 +212,7 @@ def __init__( self.qkv_proj = nn.Dense(in_channels=embed_dim, out_channels=embed_dim * 3, has_bias=bias) - self.attn_dropout = nn.Dropout(keep_prob=1.0 - attn_dropout) + self.attn_dropout = Dropout(p=attn_dropout) self.out_proj = nn.Dense(in_channels=embed_dim, out_channels=embed_dim, has_bias=bias) self.head_dim = embed_dim // num_heads @@ -279,16 +280,16 @@ def __init__( self.pre_norm_mha = nn.SequentialCell( nn.LayerNorm((embed_dim,)), attn_unit, - nn.Dropout(keep_prob=1.0 - dropout) + Dropout(p=dropout) ) self.pre_norm_ffn = nn.SequentialCell( nn.LayerNorm((embed_dim,)), nn.Dense(in_channels=embed_dim, out_channels=ffn_latent_dim, has_bias=True), nn.SiLU(), - nn.Dropout(keep_prob=1.0 - ffn_dropout), + Dropout(p=ffn_dropout), nn.Dense(in_channels=ffn_latent_dim, out_channels=embed_dim, 
has_bias=True), - nn.Dropout(keep_prob=1.0 - dropout) + Dropout(p=dropout) ) self.embed_dim = embed_dim self.ffn_dim = ffn_latent_dim @@ -412,6 +413,7 @@ def __init__( self.ffn_dropout = ffn_dropout self.n_blocks = n_transformer_blocks self.conv_ksize = conv_ksize + self.interpolate = Interpolate(mode="bilinear", align_corners=True) def unfolding(self, x: Tensor) -> Tuple[Tensor, Dict]: patch_w, patch_h = self.patch_w, self.patch_h @@ -424,7 +426,7 @@ def unfolding(self, x: Tensor) -> Tuple[Tensor, Dict]: interpolate = False if new_w != orig_w or new_h != orig_h: # Note: Padding can be done, but then it needs to be handled in attention function. - x = ops.interpolate(x, size=(new_h, new_w), coordinate_transformation_mode="align_corners", mode="bilinear") + x = self.interpolate(x, size=(new_h, new_w)) interpolate = True # number of patches along width and height @@ -477,12 +479,7 @@ def folding(self, x: Tensor, info_dict: Dict) -> Tensor: # [B*C*n_h, p_h, n_w, p_w] -> [B, C, H, W] x = ops.reshape(x, (batch_size, channels, num_patch_h * self.patch_h, num_patch_w * self.patch_w)) if info_dict["interpolate"]: - x = ops.interpolate( - x, - size=info_dict["orig_size"], - coordinate_transformation_mode="align_corners", - mode="bilinear", - ) + x = self.interpolate(x, size=info_dict["orig_size"]) return x def construct(self, x: Tensor) -> Tensor: @@ -534,7 +531,7 @@ def __init__(self, model_cfg: Dict, num_classes: int = 1000): classifier.append(GlobalAvgPooling()) classifier.append(nn.Flatten()) if 0.0 < model_cfg["cls_dropout"] < 1.0: - classifier.append(nn.Dropout(keep_prob=1 - model_cfg["cls_dropout"])) + classifier.append(Dropout(p=model_cfg["cls_dropout"])) classifier.append(nn.Dense(in_channels=exp_channels, out_channels=num_classes)) self.classifier = nn.SequentialCell(classifier) self._initialize_weights() diff --git a/mindcv/models/nasnet.py b/mindcv/models/nasnet.py index ae606882..97c22dbe 100644 --- a/mindcv/models/nasnet.py +++ b/mindcv/models/nasnet.py @@ -8,6 +8,7 @@ from mindspore import Tensor, nn, ops from .helpers import load_pretrained +from .layers.compatibility import Dropout from .layers.pooling import GlobalAvgPooling from .registry import register_model @@ -807,7 +808,7 @@ def __init__( ) # 24, 4 self.relu = nn.ReLU() - self.dropout = nn.Dropout(keep_prob=0.5) + self.dropout = Dropout(p=0.5) self.classifier = nn.Dense(in_channels=24 * filters, out_channels=num_classes) self.pool = GlobalAvgPooling() self._initialize_weights() diff --git a/mindcv/models/pit.py b/mindcv/models/pit.py index 7372e938..736940e9 100644 --- a/mindcv/models/pit.py +++ b/mindcv/models/pit.py @@ -15,6 +15,7 @@ from .helpers import load_pretrained from .layers import DropPath, Identity +from .layers.compatibility import Dropout from .registry import register_model __all__ = [ @@ -125,9 +126,9 @@ def __init__( self.q = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias) self.k = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias) self.v = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias) - self.attn_drop = nn.Dropout(keep_prob=1 - attn_drop) + self.attn_drop = Dropout(p=attn_drop) self.proj = nn.Dense(dim, dim) - self.proj_drop = nn.Dropout(keep_prob=1 - proj_drop) + self.proj_drop = Dropout(p=proj_drop) self.softmax = nn.Softmax(axis=-1) self.batchmatmul = ops.BatchMatMul() @@ -198,7 +199,7 @@ def __init__( self.fc1 = nn.Dense(in_channels=in_features, out_channels=hidden_features, has_bias=True) self.act = act_layer() self.fc2 = nn.Dense(in_channels=hidden_features, 
out_channels=out_features, has_bias=True) - self.drop = nn.Dropout(keep_prob=1.0 - drop) + self.drop = Dropout(p=drop) def construct(self, x): x = self.fc1(x) @@ -312,7 +313,7 @@ def __init__( self.patch_embed = conv_embedding(in_chans, base_dims[0] * heads[0], patch_size, stride, padding) self.cls_token = Parameter(Tensor(np.random.randn(1, 1, base_dims[0] * heads[0]), mstype.float32)) - self.pos_drop = nn.Dropout(keep_prob=1.0 - drop_rate) + self.pos_drop = Dropout(p=drop_rate) self.tile = ops.Tile() self.transformers = nn.CellList([]) diff --git a/mindcv/models/pnasnet.py b/mindcv/models/pnasnet.py index 76bffe36..fc78484b 100644 --- a/mindcv/models/pnasnet.py +++ b/mindcv/models/pnasnet.py @@ -11,6 +11,7 @@ from .helpers import load_pretrained from .layers import GlobalAvgPooling +from .layers.compatibility import Dropout from .registry import register_model __all__ = [ @@ -424,7 +425,7 @@ def __init__( self.relu = nn.ReLU() self.pool = GlobalAvgPooling() - self.dropout = nn.Dropout(keep_prob=0.5) + self.dropout = Dropout(p=0.5) self.last_linear = nn.Dense(in_channels=1080, out_channels=num_classes) self._initialize_weights() diff --git a/mindcv/models/poolformer.py b/mindcv/models/poolformer.py index 1226120b..1e0d2ecd 100644 --- a/mindcv/models/poolformer.py +++ b/mindcv/models/poolformer.py @@ -14,6 +14,7 @@ from .helpers import load_pretrained from .layers import DropPath, Identity +from .layers.compatibility import Dropout from .registry import register_model __all__ = [ @@ -80,7 +81,7 @@ def __init__( self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1, has_bias=bias[0]) self.norm = norm_layer(hidden_features) if norm_layer else Identity() self.act = act_layer(approximate=False) - self.drop = nn.Dropout(1 - drop) + self.drop = Dropout(p=drop) self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1, has_bias=bias[1]) self.cls_init_weights() diff --git a/mindcv/models/pvt.py b/mindcv/models/pvt.py index 913ca6e0..721ad75f 100644 --- a/mindcv/models/pvt.py +++ b/mindcv/models/pvt.py @@ -13,6 +13,7 @@ from mindspore.common import initializer as weight_init from .helpers import load_pretrained +from .layers.compatibility import Dropout from .layers.drop_path import DropPath from .layers.identity import Identity from .layers.mlp import Mlp @@ -68,9 +69,9 @@ def __init__( self.q = nn.Dense(dim, dim, has_bias=qkv_bias) self.kv = nn.Dense(dim, dim * 2, has_bias=qkv_bias) - self.attn_drop = nn.Dropout(1 - attn_drop) + self.attn_drop = Dropout(p=attn_drop) self.proj = nn.Dense(dim, dim) - self.proj_drop = nn.Dropout(1 - proj_drop) + self.proj_drop = Dropout(p=proj_drop) self.qk_batmatmul = ops.BatchMatMul(transpose_b=True) self.batmatmul = ops.BatchMatMul() self.softmax = nn.Softmax(axis=-1) @@ -203,7 +204,7 @@ def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, emb cur = 0 b_list = [] self.pos_embed = [] - self.pos_drop = nn.Dropout(1 - drop_rate) + self.pos_drop = Dropout(p=drop_rate) for i in range(num_stages): block = nn.CellList( [Block(dim=embed_dims[i], num_heads=num_heads[i], mlp_ratio=mlp_ratios[i], qkv_bias=qkv_bias, @@ -221,7 +222,7 @@ def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, emb embed_dim=embed_dims[0]) num_patches = self.patch_embed1.num_patches self.pos_embed1 = mindspore.Parameter(ops.zeros((1, num_patches, embed_dims[0]), mindspore.float16)) - self.pos_drop1 = nn.Dropout(1 - drop_rate) + self.pos_drop1 = Dropout(p=drop_rate) self.patch_embed2 = PatchEmbed(img_size=img_size // (2 ** (1 + 1)), 
patch_size=2, @@ -229,7 +230,7 @@ def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, emb embed_dim=embed_dims[1]) num_patches = self.patch_embed2.num_patches self.pos_embed2 = mindspore.Parameter(ops.zeros((1, num_patches, embed_dims[1]), mindspore.float16)) - self.pos_drop2 = nn.Dropout(1 - drop_rate) + self.pos_drop2 = Dropout(p=drop_rate) self.patch_embed3 = PatchEmbed(img_size=img_size // (2 ** (2 + 1)), patch_size=2, @@ -237,7 +238,7 @@ def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, emb embed_dim=embed_dims[2]) num_patches = self.patch_embed3.num_patches self.pos_embed3 = mindspore.Parameter(ops.zeros((1, num_patches, embed_dims[2]), mindspore.float16)) - self.pos_drop3 = nn.Dropout(1 - drop_rate) + self.pos_drop3 = Dropout(p=drop_rate) self.patch_embed4 = PatchEmbed(img_size // (2 ** (3 + 1)), patch_size=2, @@ -245,7 +246,7 @@ def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, emb embed_dim=embed_dims[3]) num_patches = self.patch_embed4.num_patches + 1 self.pos_embed4 = mindspore.Parameter(ops.zeros((1, num_patches, embed_dims[3]), mindspore.float16)) - self.pos_drop4 = nn.Dropout(1 - drop_rate) + self.pos_drop4 = Dropout(p=drop_rate) self.Blocks = nn.CellList(b_list) self.norm = norm_layer([embed_dims[3]]) @@ -290,10 +291,9 @@ def _get_pos_embed(self, pos_embed, ph, pw, H, W): if H * W == self.patch_embed1.num_patches: return pos_embed else: - ResizeBilinear = nn.ResizeBilinear() - pos_embed = self.transpose(self.reshape(pos_embed, (1, ph, pw, -1)), (0, 3, 1, 2)) - pos_embed = ResizeBilinear(pos_embed, (H, W)) + resize_bilinear = ops.ResizeBilinear((H, W)) + pos_embed = resize_bilinear(pos_embed) pos_embed = self.transpose(self.reshape(pos_embed, (1, -1, H * W)), (0, 2, 1)) diff --git a/mindcv/models/pvtv2.py b/mindcv/models/pvtv2.py index defac627..dd077b74 100644 --- a/mindcv/models/pvtv2.py +++ b/mindcv/models/pvtv2.py @@ -13,6 +13,7 @@ from .helpers import load_pretrained from .layers import DropPath, Identity +from .layers.compatibility import Dropout from .registry import register_model __all__ = [ @@ -73,7 +74,7 @@ def __init__(self, in_features, hidden_features=None, out_features=None, act_lay self.dwconv = DWConv(hidden_features) self.act = act_layer() self.fc2 = nn.Dense(hidden_features, out_features) - self.drop = nn.Dropout(1 - drop) + self.drop = Dropout(p=drop) self.linear = linear if self.linear: self.relu = nn.ReLU() @@ -105,9 +106,9 @@ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0. 
self.q = nn.Dense(dim, dim, has_bias=qkv_bias) self.kv = nn.Dense(dim, dim * 2, has_bias=qkv_bias) - self.attn_drop = nn.Dropout(1 - attn_drop) + self.attn_drop = Dropout(p=attn_drop) self.proj = nn.Dense(dim, dim) - self.proj_drop = nn.Dropout(1 - proj_drop) + self.proj_drop = Dropout(p=proj_drop) self.qk_batmatmul = ops.BatchMatMul(transpose_b=True) self.batmatmul = ops.BatchMatMul() self.softmax = nn.Softmax(axis=-1) diff --git a/mindcv/models/res2net.py b/mindcv/models/res2net.py index 63677e65..54a9990b 100644 --- a/mindcv/models/res2net.py +++ b/mindcv/models/res2net.py @@ -10,6 +10,7 @@ from mindspore import Tensor, nn, ops from .helpers import load_pretrained +from .layers.compatibility import Split from .layers.pooling import GlobalAvgPooling from .registry import register_model @@ -93,6 +94,7 @@ def __init__( self.stype = stype self.scale = scale self.width = width + self.split = Split(split_size_or_sections=self.width, output_num=self.scale, axis=1) def construct(self, x: Tensor) -> Tensor: identity = x @@ -101,7 +103,7 @@ def construct(self, x: Tensor) -> Tensor: out = self.bn1(out) out = self.relu(out) - spx = ops.split(out, axis=1, output_num=self.scale) + spx = self.split(out) sp = self.convs[0](spx[0]) sp = self.bns[0](sp) diff --git a/mindcv/models/resnest.py b/mindcv/models/resnest.py index c7ed9131..6a28e30e 100644 --- a/mindcv/models/resnest.py +++ b/mindcv/models/resnest.py @@ -9,6 +9,7 @@ from mindspore import Tensor, nn, ops from .helpers import build_model_with_cfg, make_divisible +from .layers.compatibility import Dropout from .layers.identity import Identity from .layers.pooling import GlobalAvgPooling from .registry import register_model @@ -319,7 +320,7 @@ def __init__( self.feature_info.append(dict(chs=block.expansion * 512, reduction=32, name='layer4')) self.avgpool = GlobalAvgPooling() - self.drop = nn.Dropout(keep_prob=1.0 - drop_rate) if drop_rate > 0.0 else None + self.drop = Dropout(p=drop_rate) if drop_rate > 0.0 else None self.fc = nn.Dense(512 * block.expansion, num_classes) self._initialize_weights() diff --git a/mindcv/models/rexnet.py b/mindcv/models/rexnet.py index 3ad1cbb1..45fe4092 100644 --- a/mindcv/models/rexnet.py +++ b/mindcv/models/rexnet.py @@ -11,6 +11,7 @@ from .helpers import build_model_with_cfg, make_divisible from .layers import Conv2dNormActivation, DropPath, GlobalAvgPooling, SqueezeExcite +from .layers.compatibility import Dropout from .registry import register_model __all__ = [ @@ -212,11 +213,11 @@ def __init__( self.features = nn.SequentialCell(*features) if self.useconv: self.cls = nn.SequentialCell( - nn.Dropout(1.0 - drop_rate), + Dropout(p=drop_rate), nn.Conv2d(pen_channels, num_classes, 1, has_bias=True)) else: self.cls = nn.SequentialCell( - nn.Dropout(1.0 - drop_rate), + Dropout(p=drop_rate), nn.Dense(pen_channels, num_classes)) self._initialize_weights() diff --git a/mindcv/models/senet.py b/mindcv/models/senet.py index 24e5f2f2..b35030ad 100644 --- a/mindcv/models/senet.py +++ b/mindcv/models/senet.py @@ -10,6 +10,7 @@ from mindspore import Tensor, nn from .helpers import load_pretrained +from .layers.compatibility import Dropout from .layers.pooling import GlobalAvgPooling from .layers.squeeze_excite import SqueezeExciteV2 from .registry import register_model @@ -310,7 +311,7 @@ def __init__( self.pool = GlobalAvgPooling() if self.drop_rate > 0.: - self.dropout = nn.Dropout(keep_prob=1. 
- self.drop_rate) + self.dropout = Dropout(p=self.drop_rate) self.classifier = nn.Dense(self.num_features, self.num_classes) self._initialize_weights() diff --git a/mindcv/models/squeezenet.py b/mindcv/models/squeezenet.py index b7056578..48fc3d11 100644 --- a/mindcv/models/squeezenet.py +++ b/mindcv/models/squeezenet.py @@ -7,6 +7,7 @@ from mindspore import Tensor, nn, ops from .helpers import load_pretrained +from .layers.compatibility import Dropout from .layers.pooling import GlobalAvgPooling from .registry import register_model @@ -117,7 +118,7 @@ def __init__( self.final_conv = nn.Conv2d(512, num_classes, kernel_size=1, has_bias=True) self.classifier = nn.SequentialCell([ - nn.Dropout(keep_prob=1 - drop_rate), + Dropout(p=drop_rate), self.final_conv, nn.ReLU(), GlobalAvgPooling() diff --git a/mindcv/models/swin_transformer.py b/mindcv/models/swin_transformer.py index 69c11a9c..237f1c7e 100644 --- a/mindcv/models/swin_transformer.py +++ b/mindcv/models/swin_transformer.py @@ -10,6 +10,7 @@ from .helpers import _ntuple, load_pretrained from .layers import DropPath, Identity +from .layers.compatibility import Dropout from .registry import register_model __all__ = [ @@ -50,7 +51,7 @@ def __init__( self.fc1 = nn.Dense(in_channels=in_features, out_channels=hidden_features, has_bias=True) self.act = act_layer() self.fc2 = nn.Dense(in_channels=hidden_features, out_channels=out_features, has_bias=True) - self.drop = nn.Dropout(keep_prob=1.0 - drop) + self.drop = Dropout(p=drop) def construct(self, x: Tensor) -> Tensor: x = self.fc1(x) @@ -150,9 +151,11 @@ def __init__( self.relative_position_bias_table = Parameter( Tensor(np.random.randn((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads), dtype=mstype.float32)) # 2*Wh-1 * 2*Ww-1, nH - self.one_hot = nn.OneHot(axis=-1, depth=(2 * window_size[0] - 1) * (2 * window_size[1] - 1), - dtype=mstype.float32) - self.index = Parameter(self.one_hot(self.relative_position_index), requires_grad=False) + self.one_hot = ops.OneHot() + self.index = Parameter(self.one_hot(self.relative_position_index, + (2 * window_size[0] - 1) * (2 * window_size[1] - 1), + Tensor(1.0), Tensor(0.0)), + requires_grad=False) def construct(self) -> Tensor: out = ops.matmul(self.index, self.relative_position_bias_table) @@ -202,9 +205,9 @@ def __init__( self.k = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias) self.v = nn.Dense(in_channels=dim, out_channels=dim, has_bias=qkv_bias) - self.attn_drop = nn.Dropout(keep_prob=1.0 - attn_drop) + self.attn_drop = Dropout(p=attn_drop) self.proj = nn.Dense(in_channels=dim, out_channels=dim, has_bias=True) - self.proj_drop = nn.Dropout(keep_prob=1.0 - proj_drop) + self.proj_drop = Dropout(p=proj_drop) self.softmax = nn.Softmax(axis=-1) self.batch_matmul = ops.BatchMatMul() @@ -627,7 +630,7 @@ def __init__( if self.ape: self.absolute_pos_embed = Parameter(Tensor(np.zeros(1, num_patches, embed_dim), dtype=mstype.float32)) - self.pos_drop = nn.Dropout(keep_prob=1.0 - drop_rate) + self.pos_drop = Dropout(p=drop_rate) # stochastic depth dpr = [x for x in np.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule diff --git a/mindcv/models/vgg.py b/mindcv/models/vgg.py index 3f08ddce..8c37d859 100644 --- a/mindcv/models/vgg.py +++ b/mindcv/models/vgg.py @@ -10,6 +10,7 @@ from mindspore import Tensor, nn from .helpers import load_pretrained +from .layers.compatibility import Dropout from .registry import register_model __all__ = [ @@ -95,10 +96,10 @@ def __init__( self.classifier = nn.SequentialCell([ 
nn.Dense(512 * 7 * 7, 4096), nn.ReLU(), - nn.Dropout(keep_prob=1 - drop_rate), + Dropout(p=drop_rate), nn.Dense(4096, 4096), nn.ReLU(), - nn.Dropout(keep_prob=1 - drop_rate), + Dropout(p=drop_rate), nn.Dense(4096, num_classes), ]) self._initialize_weights() diff --git a/mindcv/models/visformer.py b/mindcv/models/visformer.py index 97e5798e..1e120d40 100644 --- a/mindcv/models/visformer.py +++ b/mindcv/models/visformer.py @@ -13,6 +13,7 @@ from .helpers import _ntuple, load_pretrained from .layers import DropPath, GlobalAvgPooling, Identity +from .layers.compatibility import Dropout from .registry import register_model __all__ = [ @@ -73,7 +74,7 @@ def __init__( hidden_features = in_features * 2 self.hidden_features = hidden_features self.group = group - self.drop = nn.Dropout(1 - drop) + self.drop = Dropout(p=drop) self.conv1 = nn.Conv2d(in_features, hidden_features, 1, 1, pad_mode="pad", padding=0) self.act1 = act_layer() if self.spatial_conv: @@ -118,9 +119,9 @@ def __init__( self.scale = head_dim**qk_scale_factor self.qkv = nn.Conv2d(dim, head_dim * num_heads * 3, 1, 1, pad_mode="pad", padding=0, has_bias=qkv_bias) - self.attn_drop = nn.Dropout(1 - attn_drop) + self.attn_drop = Dropout(p=attn_drop) self.proj = nn.Conv2d(self.head_dim * self.num_heads, dim, 1, 1, pad_mode="pad", padding=0) - self.proj_drop = nn.Dropout(1 - proj_drop) + self.proj_drop = Dropout(p=proj_drop) def construct(self, x: Tensor) -> Tensor: B, C, H, W = x.shape @@ -275,7 +276,7 @@ def __init__( ]) img_size //= 2 - self.pos_drop = nn.Dropout(1 - drop_rate) + self.pos_drop = Dropout(p=drop_rate) # stage0 if depth[0]: self.patch_embed0 = PatchEmbed(img_size=img_size, patch_size=2, in_chans=self.init_channels, diff --git a/mindcv/models/vit.py b/mindcv/models/vit.py index 2d89e21d..ac2c4c4c 100644 --- a/mindcv/models/vit.py +++ b/mindcv/models/vit.py @@ -11,6 +11,7 @@ from mindspore.common.parameter import Parameter from .helpers import ConfigDict, load_pretrained +from .layers.compatibility import Dropout from .registry import register_model __all__ = [ @@ -127,9 +128,9 @@ def __init__( self.scale = Tensor(head_dim**-0.5) self.qkv = nn.Dense(dim, dim * 3) - self.attn_drop = nn.Dropout(attention_keep_prob) + self.attn_drop = Dropout(p=1.0-attention_keep_prob) self.out = nn.Dense(dim, dim) - self.out_drop = nn.Dropout(keep_prob) + self.out_drop = Dropout(p=1.0-keep_prob) self.mul = ops.Mul() self.reshape = ops.Reshape() @@ -194,7 +195,7 @@ def __init__( self.dense1 = nn.Dense(in_features, hidden_features) self.activation = activation() self.dense2 = nn.Dense(hidden_features, out_features) - self.dropout = nn.Dropout(keep_prob) + self.dropout = Dropout(p=1.0-keep_prob) def construct(self, x): """Feed Forward construct.""" @@ -364,7 +365,7 @@ def __init__( ) -> None: super().__init__() - self.dropout = nn.Dropout(keep_prob) + self.dropout = Dropout(p=1.0-keep_prob) self.classifier = nn.Dense(input_channel, num_classes, has_bias=has_bias, activation=activation) def construct(self, x): @@ -568,7 +569,7 @@ def __init__( self.mean = ops.ReduceMean(keep_dims=False) self.pool = pool - self.pos_dropout = nn.Dropout(keep_prob) + self.pos_dropout = Dropout(p=1.0-keep_prob) self.norm = norm((embed_dim,)) self.tile = ops.Tile() self.transformer = TransformerEncoder( diff --git a/mindcv/models/xception.py b/mindcv/models/xception.py index d97e4cf3..445cbcde 100644 --- a/mindcv/models/xception.py +++ b/mindcv/models/xception.py @@ -8,6 +8,7 @@ from .helpers import load_pretrained from .layers import GlobalAvgPooling +from 
.layers.compatibility import Dropout from .registry import register_model __all__ = [ @@ -156,7 +157,7 @@ def __init__( self.bn4 = nn.BatchNorm2d(2048) self.pool = GlobalAvgPooling() - self.dropout = nn.Dropout() + self.dropout = Dropout(p=0.5) self.classifier = nn.Dense(2048, num_classes) self._initialize_weights() diff --git a/mindcv/models/xcit.py b/mindcv/models/xcit.py index e56472a8..a02ca7a6 100644 --- a/mindcv/models/xcit.py +++ b/mindcv/models/xcit.py @@ -13,7 +13,8 @@ from mindspore import nn, numpy, ops from .helpers import _ntuple, load_pretrained -from .layers import DropPath +from .layers.compatibility import Dropout +from .layers.drop_path import DropPath from .layers.mlp import Mlp from .registry import register_model @@ -193,9 +194,9 @@ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0. self.qkv = nn.Dense( in_channels=dim, out_channels=dim * 3, has_bias=qkv_bias) - self.attn_drop = nn.Dropout(keep_prob=1 - attn_drop) + self.attn_drop = Dropout(p=attn_drop) self.proj = nn.Dense(in_channels=dim, out_channels=dim) - self.proj_drop = nn.Dropout(keep_prob=1 - proj_drop) + self.proj_drop = Dropout(p=proj_drop) self.softmax = nn.Softmax(axis=-1) self.attn_matmul_v = ops.BatchMatMul() @@ -286,10 +287,10 @@ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0. in_channels=dim, out_channels=dim * 3, has_bias=qkv_bias) self.q_matmul_k = ops.BatchMatMul(transpose_b=True) self.softmax = nn.Softmax(axis=-1) - self.attn_drop = nn.Dropout(keep_prob=1.0 - attn_drop) + self.attn_drop = Dropout(p=attn_drop) self.attn_matmul_v = ops.BatchMatMul() self.proj = nn.Dense(in_channels=dim, out_channels=dim) - self.proj_drop = nn.Dropout(keep_prob=1.0 - proj_drop) + self.proj_drop = Dropout(p=proj_drop) def construct(self, x): B, N, C = x.shape @@ -407,7 +408,7 @@ def __init__(self, self.cls_token = Parameter( ops.zeros((1, 1, embed_dim), mstype.float32)) - self.pos_drop = nn.Dropout(keep_prob=1.0 - drop_rate) + self.pos_drop = Dropout(p=drop_rate) dpr = [drop_path_rate for i in range(depth)] self.blocks = nn.CellList([ diff --git a/mindcv/optim/adan.py b/mindcv/optim/adan.py index c16fc971..7494c67b 100644 --- a/mindcv/optim/adan.py +++ b/mindcv/optim/adan.py @@ -2,7 +2,6 @@ import mindspore as ms from mindspore import ops from mindspore.common import dtype as mstype -from mindspore.common.api import ms_function from mindspore.common.tensor import Tensor from mindspore.nn.optim.optimizer import Optimizer, opt_init_args_register @@ -145,7 +144,6 @@ def __init__( self.weight_decay = Tensor(weight_decay, mstype.float32) - @ms_function def construct(self, gradients): params = self._parameters moment1 = self.moment1 diff --git a/mindcv/optim/nadam.py b/mindcv/optim/nadam.py index 42c88046..a257bf8c 100644 --- a/mindcv/optim/nadam.py +++ b/mindcv/optim/nadam.py @@ -3,7 +3,6 @@ import mindspore as ms from mindspore import ops -from mindspore.common.api import ms_function from mindspore.common.initializer import initializer from mindspore.common.parameter import Parameter from mindspore.common.tensor import Tensor @@ -49,7 +48,6 @@ def __init__( self.mu_schedule = Parameter(initializer(1, [1], ms.float32), name="mu_schedule") self.beta2_power = Parameter(initializer(1, [1], ms.float32), name="beta2_power") - @ms_function def construct(self, gradients): lr = self.get_lr() params = self.parameters diff --git a/mindcv/utils/amp.py b/mindcv/utils/amp.py index a4f39737..92ee5177 100644 --- a/mindcv/utils/amp.py +++ b/mindcv/utils/amp.py @@ -1,10 +1,13 @@ 
""" auto mixed precision related functions """ # from mindspore.amp import LossScaler # this line of code leads to “get rank id error” in modelarts -from mindspore.common.api import ms_class +try: + from mindspore import jit_class +except ImportError: + from mindspore import ms_class as jit_class -@ms_class +@jit_class class LossScaler: r""" Loss scaler abstract class when using mixed precision. diff --git a/mindcv/utils/callbacks.py b/mindcv/utils/callbacks.py index 6e1145bf..b99ebf24 100644 --- a/mindcv/utils/callbacks.py +++ b/mindcv/utils/callbacks.py @@ -269,9 +269,16 @@ def _get_optimizer_from_cbp(self, cb_params): def _get_lr_from_cbp(self, cb_params): optimizer = self._get_optimizer_from_cbp(cb_params) - step = optimizer.global_step + if optimizer.global_step < 1: + _logger.warning( + "`global_step` of optimizer is less than 1. It seems to be a overflow at the first step. " + "If you keep seeing this message, it means that the optimizer never actually called." + ) + optim_step = Tensor((0,), ms.int32) + else: # if the optimizer is successfully called, the global_step will actually be the value of next step. + optim_step = optimizer.global_step - 1 if optimizer.dynamic_lr: - lr = optimizer.learning_rate(step - 1)[0] + lr = optimizer.learning_rate(optim_step)[0] else: lr = optimizer.learning_rate return lr diff --git a/tests/README.md b/tests/README.md index d9964211..8c943e87 100644 --- a/tests/README.md +++ b/tests/README.md @@ -7,7 +7,7 @@ pytest tests/modules/*.py - `tasks` for system test (ST): test the training and validation pipeline. -To test the training process (in graph mode and pynative+ms_function mode) and the validation process, run +To test the training process (in graph mode and pynative+mindspore.jit mode) and the validation process, run ```shell pytest tests/tasks/test_train_val_imagenet_subset.py ``` diff --git a/tests/tasks/non_cpu/test_train_val_imagenet_subset.py b/tests/tasks/non_cpu/test_train_val_imagenet_subset.py index 6ef086f1..1d3db4d5 100644 --- a/tests/tasks/non_cpu/test_train_val_imagenet_subset.py +++ b/tests/tasks/non_cpu/test_train_val_imagenet_subset.py @@ -1,6 +1,6 @@ """ Test train and validate pipelines. -For training, both graph mode and pynative mode with ms_function will be tested. +For training, both graph mode and pynative mode with mindspore.jit will be tested. """ import os import subprocess diff --git a/tests/tasks/test_train_val_imagenet_subset.py b/tests/tasks/test_train_val_imagenet_subset.py index 85a7e184..2fd677e4 100644 --- a/tests/tasks/test_train_val_imagenet_subset.py +++ b/tests/tasks/test_train_val_imagenet_subset.py @@ -1,6 +1,6 @@ """ Test train and validate pipelines. -For training, both graph mode and pynative mode with ms_function will be tested. +For training, both graph mode and pynative mode with mindspore.jit will be tested. """ import os import subprocess diff --git a/train_with_func.py b/train_with_func.py index 61a691f1..cfae385e 100644 --- a/train_with_func.py +++ b/train_with_func.py @@ -21,6 +21,11 @@ from config import parse_args # isort: skip +try: + from mindspore import jit +except ImportError: + from mindspore import ms_function as jit + logger = logging.getLogger("train") logger.setLevel(logging.INFO) h1 = logging.StreamHandler() @@ -366,7 +371,7 @@ def forward_fn(data, label): grad_reducer = ops.functional.identity # Define function of one-step training - @ms.ms_function + @jit def train_step(data, label): (loss, logits), grads = grad_fn(data, label) grads = grad_reducer(grads)