From 224588e02b05d6b11c9a553fda3d548eadbdff1b Mon Sep 17 00:00:00 2001
From: osmr <osemery@gmail.com>
Date: Fri, 17 Aug 2018 02:51:21 +0300
Subject: [PATCH] After testing the release

---
 README.md                       |   2 +-
 gluon/models/model_store.py     |   1 -
 pytorch/models/model_store.py   |   1 -
 pytorch/models/others/MENet.py  | 227 --------------------
 pytorch/models/others/layers.py | 259 -----------------------
 pytorch/models/others/slim.py   | 360 --------------------------------
 6 files changed, 1 insertion(+), 849 deletions(-)
 delete mode 100644 pytorch/models/others/MENet.py
 delete mode 100644 pytorch/models/others/layers.py
 delete mode 100644 pytorch/models/others/slim.py

diff --git a/README.md b/README.md
index 273fe59fe..5ff0f5086 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ torchvision >= 0.2.1
 - ResNet (['Deep Residual Learning for Image Recognition'](https://arxiv.org/abs/1512.03385))
 - PreResNet (['Identity Mappings in Deep Residual Networks'](https://arxiv.org/abs/1603.05027))
 - DenseNet (['Densely Connected Convolutional Networks'](https://arxiv.org/abs/1608.06993))
-- CondenseNet (['Condense````Net: An Efficient DenseNet using Learned Group Convolutions'](https://arxiv.org/abs/1711.09224))
+- CondenseNet (['CondenseNet: An Efficient DenseNet using Learned Group Convolutions'](https://arxiv.org/abs/1711.09224))
 - DarkNet (['Darknet: Open source neural networks in c'](https://github.com/pjreddie/darknet)) 
 - SqueezeNet (['SqueezeNet: AlexNet-level accuracy with 50x fewer parameters and <0.5MB model size'](https://arxiv.org/abs/1602.07360))
 - SqueezeNext (['SqueezeNext: Hardware-Aware Neural Network Design'](https://arxiv.org/abs/1803.10615))
diff --git a/gluon/models/model_store.py b/gluon/models/model_store.py
index 59f04c21f..f7d1b08ea 100644
--- a/gluon/models/model_store.py
+++ b/gluon/models/model_store.py
@@ -47,7 +47,6 @@
     ('menet348_12x1_g3', '1141', 'ac69b246629131d77bf5a0a454bda28f5c2e6bc0', 'v0.0.6'),
     ('menet352_12x1_g8', '1375', '85779b8a576540ec1082a433bd5ea1ab93def27a', 'v0.0.6'),
     ('menet456_24x1_g3', '1043', '6e777068761f9c45cd0527f3824ad3b5cf36b0b5', 'v0.0.6'),
-
     ('mobilenet_wd4', '2410', 'db312a26033119ad1601fe0300e7c52a11cba93c', 'v0.0.7'),
     ('mobilenet_wd2', '1537', '5419ccc26dedfbb7242e2f4f7c52b13f94812099', 'v0.0.7'),
     ('mobilenet_w3d4', '1228', 'dc11727a3917f2c795c9f286ad9cf299a165fe85', 'v0.0.7'),
diff --git a/pytorch/models/model_store.py b/pytorch/models/model_store.py
index 0a0957d48..522b07dea 100644
--- a/pytorch/models/model_store.py
+++ b/pytorch/models/model_store.py
@@ -48,7 +48,6 @@
     ('menet348_12x1_g3', '1092', '66be1a1896fa0bea27290580e8b98057dfdbda2c', 'v0.0.6'),
     ('menet352_12x1_g8', '1308', 'e91ec72ce2d0c3c2bf2a3cba6719c6b23ea7c736', 'v0.0.6'),
     ('menet456_24x1_g3', '0993', 'cb9fd37660b6064f44a6c779a330a967b2b41c2d', 'v0.0.6'),
-
     ('mobilenet_wd4', '2493', 'c05b5fab876300552b1c9b58d82ff98eb755c15b', 'v0.0.7'),
     ('mobilenet_wd2', '1599', '5883b38d611897bf4b1b49d9eeded2d1868c5c0a', 'v0.0.7'),
     ('mobilenet_w3d4', '1285', 'b8022faebe280b6e6571bec3a4bb6e293895a72d', 'v0.0.7'),
diff --git a/pytorch/models/others/MENet.py b/pytorch/models/others/MENet.py
deleted file mode 100644
index b2b2f96ac..000000000
--- a/pytorch/models/others/MENet.py
+++ /dev/null
@@ -1,227 +0,0 @@
-'''
-Merging-and-Evolution Network
-'''
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from collections import OrderedDict
-from .common import channel_shuffle
-
-__all__ = [
-    'menet',
-    'oth_menet108_8x1_g3',
-    'oth_menet128_8x1_g4',
-    'oth_menet160_8x1_g8',
-    'oth_menet228_12x1_g3',
-    'oth_menet256_12x1_g4',
-    'oth_menet348_12x1_g3',
-    'oth_menet352_12x1_g8',
-    'oth_menet456_24x1_g3',
-]
-
-
-def depthwise_conv(c, stride):
-    return nn.Conv2d(c, c, 3, stride=stride, padding=1, groups=c, bias=False)
-
-
-def group_conv(in_c, out_c, groups):
-    return nn.Conv2d(in_c, out_c, 1, groups=groups, bias=False)
-
-
-def conv1x1(in_c, out_c):
-    return nn.Conv2d(in_c, out_c, 1, bias=False)
-
-
-def conv3x3(in_c, out_c, stride):
-    return nn.Conv2d(in_c, out_c, 3, stride=stride, padding=1, bias=False)
-
-
-class _MEModule(nn.Module):
-    def __init__(self, in_c, out_c, side_c, downsample, groups, ignore_group):
-        super(_MEModule, self).__init__()
-        bott = out_c // 4
-        self.downsample = downsample
-        self.groups = groups
-        if downsample:
-            out_c -= in_c
-            # residual branch
-            if ignore_group:
-                self.compress = group_conv(in_c, bott, 1)
-            else:
-                self.compress = group_conv(in_c, bott, groups)
-            self.bn_compress = nn.BatchNorm2d(bott)
-            self.depthwise = depthwise_conv(bott, 2)
-            self.bn_depthwise = nn.BatchNorm2d(bott)
-            self.expand = group_conv(bott, out_c, groups)
-            self.bn_expand = nn.BatchNorm2d(out_c)
-            self.pool = nn.AvgPool2d(3, stride=2, padding=1)
-            # fusion branch
-            self.s_merge = conv1x1(bott, side_c)
-            self.s_bn_merge = nn.BatchNorm2d(side_c)
-            self.s_conv = conv3x3(side_c, side_c, 2)
-            self.s_bn_conv = nn.BatchNorm2d(side_c)
-            self.s_evolve = conv1x1(side_c, bott)
-            self.s_bn_evolve = nn.BatchNorm2d(bott)
-        else:
-            # residual branch
-            self.compress = group_conv(in_c, bott, groups)
-            self.bn_compress = nn.BatchNorm2d(bott)
-            self.depthwise = depthwise_conv(bott, 1)
-            self.bn_depthwise = nn.BatchNorm2d(bott)
-            self.expand = group_conv(bott, out_c, groups)
-            self.bn_expand = nn.BatchNorm2d(out_c)
-            # fusion branch
-            self.s_merge = conv1x1(bott, side_c)
-            self.s_bn_merge = nn.BatchNorm2d(side_c)
-            self.s_conv = conv3x3(side_c, side_c, 1)
-            self.s_bn_conv = nn.BatchNorm2d(side_c)
-            self.s_evolve = conv1x1(side_c, bott)
-            self.s_bn_evolve = nn.BatchNorm2d(bott)
-
-    def forward(self, x):
-        identity = x
-        # pointwise group convolution 1
-        x = self.compress(x)
-        x = self.bn_compress(x)
-        x = F.relu(x, inplace=True)
-        x = channel_shuffle(x, self.groups)
-        # merging
-        y = self.s_merge(x)
-        y = self.s_bn_merge(y)
-        y = F.relu(y, inplace=True)
-        # depthwise convolution (bottleneck)
-        x = self.depthwise(x)
-        x = self.bn_depthwise(x)
-        # evolution
-        y = self.s_conv(y)
-        y = self.s_bn_conv(y)
-        y = F.relu(y, inplace=True)
-        y = self.s_evolve(y)
-        y = self.s_bn_evolve(y)
-        y = F.sigmoid(y)
-        x *= y
-        # pointwise group convolution 2
-        x = self.expand(x)
-        x = self.bn_expand(x)
-        # identity branch
-        if self.downsample:
-            identity = self.pool(identity)
-            x = torch.cat((x, identity), dim=1)
-        else:
-            x += identity
-        x = F.relu(x, inplace=True)
-        return x
-
-
-class _InitBlock(nn.Module):
-    def __init__(self, init_c):
-        super(_InitBlock, self).__init__()
-        self.conv = conv3x3(3, init_c, 2)
-        self.bn = nn.BatchNorm2d(init_c)
-        self.pool = nn.MaxPool2d(3, stride=2, padding=1)
-
-    def forward(self, x):
-        x = self.conv(x)
-        x = self.bn(x)
-        x = F.relu(x, inplace=True)
-        x = self.pool(x)
-        return x
-
-
-class MENet(nn.Module):
-    def __init__(self, block_channels, block_layers, init_c, side_channels, groups):
-        super(MENet, self).__init__()
-        self.features = nn.Sequential(OrderedDict([
-            ('init', _InitBlock(init_c)),
-        ]))
-        in_c = init_c
-        for i, (out_c, num_layers, side_c) in enumerate(zip(block_channels, block_layers, side_channels)):
-            self.features.add_module(
-                'stage_{}_{}'.format(i + 1, 1),
-                _MEModule(in_c, out_c, side_c, True, groups, (i == 0))
-            )
-            for _ in range(num_layers):
-                self.features.add_module(
-                    'stage_{}_{}'.format(i + 1, _ + 2),
-                    _MEModule(out_c, out_c, side_c, False, groups, False)
-                )
-            in_c = out_c
-        self.pool = nn.AvgPool2d(7)
-        self.classifier = nn.Linear(in_c, 1000)
-
-    def forward(self, x):
-        x = self.features(x)
-        x = self.pool(x)
-        x = x.view(x.size(0), -1)
-        x = self.classifier(x)
-        #print(tuple(x.size()))
-        return x
-
-
-def menet(model_config):
-    block_channels = model_config['block_channels']
-    block_layers = model_config['block_layers']
-    init_c = model_config['init_c']
-    side_channels = model_config['side_channels']
-    groups = model_config['groups']
-    return MENet(block_channels, block_layers, init_c, side_channels, groups)
-
-
-def oth_menet108_8x1_g3(**kwargs):
-    return menet({"block_channels": [108, 216, 432], "block_layers": [3, 7, 3], "init_c": 12,
-                  "side_channels": [8, 8, 8], "groups": 3})
-
-
-def oth_menet128_8x1_g4(**kwargs):
-    return menet({"block_channels": [128, 256, 512], "block_layers": [3, 7, 3], "init_c": 12,
-                  "side_channels": [8, 8, 8], "groups": 4})
-
-
-def oth_menet160_8x1_g8(**kwargs):
-    return menet({"block_channels": [160, 320, 640], "block_layers": [3, 7, 3], "init_c": 16,
-                  "side_channels": [8, 8, 8], "groups": 8})
-
-
-def oth_menet228_12x1_g3(**kwargs):
-    return menet({"block_channels": [228, 456, 912], "block_layers": [3, 7, 3], "init_c": 24,
-                  "side_channels": [12, 12, 12], "groups": 3})
-
-
-def oth_menet256_12x1_g4(**kwargs):
-    return menet({"block_channels": [256, 512, 1024], "block_layers": [3, 7, 3], "init_c": 24,
-                  "side_channels": [12, 12, 12], "groups": 4})
-
-
-def oth_menet348_12x1_g3(**kwargs):
-    return menet({"block_channels": [348, 696, 1392], "block_layers": [3, 7, 3], "init_c": 24,
-                  "side_channels": [12, 12, 12], "groups": 3})
-
-
-def oth_menet352_12x1_g8(**kwargs):
-    return menet({"block_channels": [352, 704, 1408], "block_layers": [3, 7, 3], "init_c": 24,
-                  "side_channels": [12, 12, 12], "groups": 8})
-
-
-def oth_menet456_24x1_g3(**kwargs):
-    return menet({"block_channels": [456, 912, 1824], "block_layers": [3, 7, 3], "init_c": 48,
-                  "side_channels": [24, 24, 24], "groups": 3})
-
-
-if __name__ == "__main__":
-    import numpy as np
-    import torch
-    from torch.autograd import Variable
-
-    net = oth_menet456_24x1_g3(num_classes=1000)
-
-    input = Variable(torch.randn(1, 3, 224, 224))
-    output = net(input)
-    #print(output.size())
-    #print("net={}".format(net))
-
-    net.eval()
-    net_params = filter(lambda p: p.requires_grad, net.parameters())
-    weight_count = 0
-    for param in net_params:
-        weight_count += np.prod(param.size())
-    print("weight_count={}".format(weight_count))
diff --git a/pytorch/models/others/layers.py b/pytorch/models/others/layers.py
deleted file mode 100644
index e9c0c932b..000000000
--- a/pytorch/models/others/layers.py
+++ /dev/null
@@ -1,259 +0,0 @@
-from __future__ import absolute_import
-from __future__ import unicode_literals
-from __future__ import print_function
-from __future__ import division
-
-import torch
-import torch.nn as nn
-from torch.autograd import Variable
-import torch.nn.functional as F
-
-
-class LearnedGroupConv(nn.Module):
-    global_progress = 0.0
-
-    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
-                 padding=0, dilation=1, groups=1,
-                 condense_factor=None, dropout_rate=0.):
-        super(LearnedGroupConv, self).__init__()
-        self.norm = nn.BatchNorm2d(in_channels)
-        self.relu = nn.ReLU(inplace=True)
-        self.dropout_rate = dropout_rate
-        if self.dropout_rate > 0:
-            self.drop = nn.Dropout(dropout_rate, inplace=False)
-        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride,
-                              padding, dilation, groups=1, bias=False)
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.groups = groups
-        self.condense_factor = condense_factor
-        if self.condense_factor is None:
-            self.condense_factor = self.groups
-        ### Parameters that should be carefully used
-        self.register_buffer('_count', torch.zeros(1))
-        self.register_buffer('_stage', torch.zeros(1))
-        self.register_buffer('_mask', torch.ones(self.conv.weight.size()))
-        ### Check if arguments are valid
-        assert self.in_channels % self.groups == 0, "group number can not be divided by input channels"
-        assert self.in_channels % self.condense_factor == 0, "condensation factor can not be divided by input channels"
-        assert self.out_channels % self.groups == 0, "group number can not be divided by output channels"
-
-    def forward(self, x):
-        self._check_drop()
-        x = self.norm(x)
-        x = self.relu(x)
-        if self.dropout_rate > 0:
-            x = self.drop(x)
-        ### Masked output
-        weight = self.conv.weight * self.mask
-        return F.conv2d(x, weight, None, self.conv.stride,
-                        self.conv.padding, self.conv.dilation, 1)
-
-    def _check_drop(self):
-        progress = LearnedGroupConv.global_progress
-        delta = 0
-        ### Get current stage
-        for i in range(self.condense_factor - 1):
-            if progress * 2 < (i + 1) / (self.condense_factor - 1):
-                stage = i
-                break
-        else:
-            stage = self.condense_factor - 1
-        ### Check for dropping
-        if not self._reach_stage(stage):
-            self.stage = stage
-            delta = self.in_channels // self.condense_factor
-        if delta > 0:
-            self._dropping(delta)
-        return
-
-    def _dropping(self, delta):
-        weight = self.conv.weight * self.mask
-        ### Sum up all kernels
-        ### Assume only apply to 1x1 conv to speed up
-        assert weight.size()[-1] == 1
-        weight = weight.abs().squeeze()
-        assert weight.size()[0] == self.out_channels
-        assert weight.size()[1] == self.in_channels
-        d_out = self.out_channels // self.groups
-        ### Shuffle weight
-        weight = weight.view(d_out, self.groups, self.in_channels)
-        weight = weight.transpose(0, 1).contiguous()
-        weight = weight.view(self.out_channels, self.in_channels)
-        ### Sort and drop
-        for i in range(self.groups):
-            wi = weight[i * d_out:(i + 1) * d_out, :]
-            ### Take corresponding delta index
-            di = wi.sum(0).sort()[1][self.count:self.count + delta]
-            for d in di.data:
-                self._mask[i::self.groups, d, :, :].fill_(0)
-        self.count = self.count + delta
-
-    @property
-    def count(self):
-        return int(self._count[0])
-
-    @count.setter
-    def count(self, val):
-        self._count.fill_(val)
-
-    @property
-    def stage(self):
-        return int(self._stage[0])
-
-    @stage.setter
-    def stage(self, val):
-        self._stage.fill_(val)
-
-    @property
-    def mask(self):
-        return Variable(self._mask)
-
-    def _reach_stage(self, stage):
-        return (self._stage >= stage).all()
-
-    @property
-    def lasso_loss(self):
-        if self._reach_stage(self.groups - 1):
-            return 0
-        weight = self.conv.weight * self.mask
-        ### Assume only apply to 1x1 conv to speed up
-        assert weight.size()[-1] == 1
-        weight = weight.squeeze().pow(2)
-        d_out = self.out_channels // self.groups
-        ### Shuffle weight
-        weight = weight.view(d_out, self.groups, self.in_channels)
-        weight = weight.sum(0).clamp(min=1e-6).sqrt()
-        return weight.sum()
-
-
-def ShuffleLayer(x, groups):
-    batchsize, num_channels, height, width = x.data.size()
-    channels_per_group = num_channels // groups
-    ### reshape
-    x = x.view(batchsize, groups,
-               channels_per_group, height, width)
-    ### transpose
-    x = torch.transpose(x, 1, 2).contiguous()
-    ### flatten
-    x = x.view(batchsize, -1, height, width)
-    return x
-
-
-class CondensingLinear(nn.Module):
-    def __init__(self, model, drop_rate=0.5):
-        super(CondensingLinear, self).__init__()
-        self.in_features = int(model.in_features * drop_rate)
-        self.out_features = model.out_features
-        self.linear = nn.Linear(self.in_features, self.out_features)
-        self.register_buffer('index', torch.LongTensor(self.in_features))
-        _, index = model.weight.data.abs().sum(0).sort()
-        index = index[model.in_features - self.in_features:]
-        self.linear.bias.data = model.bias.data.clone()
-        for i in range(self.in_features):
-            self.index[i] = index[i]
-            self.linear.weight.data[:, i] = model.weight.data[:, index[i]]
-
-    def forward(self, x):
-        x = torch.index_select(x, 1, Variable(self.index))
-        x = self.linear(x)
-        return x
-
-
-class CondensingConv(nn.Module):
-    def __init__(self, model):
-        super(CondensingConv, self).__init__()
-        self.in_channels = model.conv.in_channels \
-                           * model.groups // model.condense_factor
-        self.out_channels = model.conv.out_channels
-        self.groups = model.groups
-        self.condense_factor = model.condense_factor
-        self.norm = nn.BatchNorm2d(self.in_channels)
-        self.relu = nn.ReLU(inplace=True)
-        self.conv = nn.Conv2d(self.in_channels, self.out_channels,
-                              kernel_size=model.conv.kernel_size,
-                              padding=model.conv.padding,
-                              groups=self.groups,
-                              bias=False,
-                              stride=model.conv.stride)
-        self.register_buffer('index', torch.LongTensor(self.in_channels))
-        index = 0
-        mask = model._mask.mean(-1).mean(-1)
-        for i in range(self.groups):
-            for j in range(model.conv.in_channels):
-                if index < (self.in_channels // self.groups) * (i + 1) \
-                        and mask[i, j] == 1:
-                    for k in range(self.out_channels // self.groups):
-                        idx_i = int(k + i * (self.out_channels // self.groups))
-                        idx_j = index % (self.in_channels // self.groups)
-                        self.conv.weight.data[idx_i, idx_j, :, :] = \
-                            model.conv.weight.data[int(i + k * self.groups), j, :, :]
-                        self.norm.weight.data[index] = model.norm.weight.data[j]
-                        self.norm.bias.data[index] = model.norm.bias.data[j]
-                        self.norm.running_mean[index] = model.norm.running_mean[j]
-                        self.norm.running_var[index] = model.norm.running_var[j]
-                    self.index[index] = j
-                    index += 1
-
-    def forward(self, x):
-        x = torch.index_select(x, 1, Variable(self.index))
-        x = self.norm(x)
-        x = self.relu(x)
-        x = self.conv(x)
-        x = ShuffleLayer(x, self.groups)
-        return x
-
-
-class CondenseLinear(nn.Module):
-    def __init__(self, in_features, out_features, drop_rate=0.5):
-        super(CondenseLinear, self).__init__()
-        self.in_features = int(in_features * drop_rate)
-        self.out_features = out_features
-        self.linear = nn.Linear(self.in_features, self.out_features)
-        self.register_buffer('index', torch.LongTensor(self.in_features))
-
-    def forward(self, x):
-        x = torch.index_select(x, 1, Variable(self.index))
-        x = self.linear(x)
-        return x
-
-
-class CondenseConv(nn.Module):
-    def __init__(self, in_channels, out_channels, kernel_size,
-                 stride=1, padding=0, groups=1):
-        super(CondenseConv, self).__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.groups = groups
-        self.norm = nn.BatchNorm2d(self.in_channels)
-        self.relu = nn.ReLU(inplace=True)
-        self.conv = nn.Conv2d(self.in_channels, self.out_channels,
-                              kernel_size=kernel_size,
-                              stride=stride,
-                              padding=padding,
-                              groups=self.groups,
-                              bias=False)
-        self.register_buffer('index', torch.LongTensor(self.in_channels))
-        self.index.fill_(0)
-
-    def forward(self, x):
-        x = torch.index_select(x, 1, Variable(self.index))
-        x = self.norm(x)
-        x = self.relu(x)
-        x = self.conv(x)
-        x = ShuffleLayer(x, self.groups)
-        return x
-
-
-class Conv(nn.Sequential):
-    def __init__(self, in_channels, out_channels, kernel_size,
-                 stride=1, padding=0, groups=1):
-        super(Conv, self).__init__()
-        self.add_module('norm', nn.BatchNorm2d(in_channels))
-        self.add_module('relu', nn.ReLU(inplace=True))
-        self.add_module('conv', nn.Conv2d(in_channels, out_channels,
-                                          kernel_size=kernel_size,
-                                          stride=stride,
-                                          padding=padding, bias=False,
-                                          groups=groups))
-
diff --git a/pytorch/models/others/slim.py b/pytorch/models/others/slim.py
deleted file mode 100644
index 0339cfe57..000000000
--- a/pytorch/models/others/slim.py
+++ /dev/null
@@ -1,360 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import torch
-import torch.nn as nn
-
-try:
-    import caffe
-    from caffe import layers as L
-    from caffe import params as P
-except ImportError:
-    pass
-
-
-def g_name(g_name, m):
-    m.g_name = g_name
-    return m
-
-
-class ChannelShuffle(nn.Module):
-    def __init__(self, groups):
-        super(ChannelShuffle, self).__init__()
-        self.groups = groups
-
-    def forward(self, x):
-        x = x.reshape(x.shape[0], self.groups, x.shape[1] // self.groups, x.shape[2], x.shape[3])
-        x = x.permute(0, 2, 1, 3, 4)
-        x = x.reshape(x.shape[0], -1, x.shape[3], x.shape[4])
-        return x
-
-    def generate_caffe_prototxt(self, caffe_net, layer):
-        layer = L.ShuffleChannel(layer, group=self.groups)
-        caffe_net[self.g_name] = layer
-        return layer
-
-
-def channel_shuffle(name, groups):
-    return g_name(name, ChannelShuffle(groups))
-
-
-class Permute(nn.Module):
-    def __init__(self, order):
-        super(Permute, self).__init__()
-        self.order = order
-
-    def forward(self, x):
-        x = x.permute(*self.order).contiguous()
-        return x
-
-    def generate_caffe_prototxt(self, caffe_net, layer):
-        layer = L.Permute(layer, order=list(self.order))
-        caffe_net[self.g_name] = layer
-        return layer
-
-
-def permute(name, order):
-    return g_name(name, Permute(order))
-
-
-class Flatten(nn.Module):
-    def __init__(self, axis):
-        super(Flatten, self).__init__()
-        self.axis = axis
-
-    def forward(self, x):
-        assert self.axis == 1
-        x = x.reshape(x.shape[0], -1)
-        return x
-
-    def generate_caffe_prototxt(self, caffe_net, layer):
-        layer = L.Flatten(layer, axis=self.axis)
-        caffe_net[self.g_name] = layer
-        return layer
-
-
-def flatten(name, axis):
-    return g_name(name, Flatten(axis))
-
-
-def generate_caffe_prototxt(m, caffe_net, layer):
-    if hasattr(m, 'generate_caffe_prototxt'):
-        return m.generate_caffe_prototxt(caffe_net, layer)
-
-    if isinstance(m, nn.Sequential):
-        for module in m:
-            layer = generate_caffe_prototxt(module, caffe_net, layer)
-        return layer
-
-    if isinstance(m, nn.Conv2d):
-        if m.bias is None:
-            param=[dict(lr_mult=1, decay_mult=1)]
-        else:
-            param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=1, decay_mult=0)]
-        assert m.dilation[0] == m.dilation[1]
-        convolution_param=dict(
-            num_output=m.out_channels,
-            group=m.groups, bias_term=(m.bias is not None),
-            weight_filler=dict(type='msra'),
-            dilation=m.dilation[0],
-        )
-        if m.kernel_size[0] == m.kernel_size[1]:
-            convolution_param['kernel_size'] = m.kernel_size[0]
-        else:
-            convolution_param['kernel_h'] = m.kernel_size[0]
-            convolution_param['kernel_w'] = m.kernel_size[1]
-        if m.stride[0] == m.stride[1]:
-            convolution_param['stride'] = m.stride[0]
-        else:
-            convolution_param['stride_h'] = m.stride[0]
-            convolution_param['stride_w'] = m.stride[1]
-        if m.padding[0] == m.padding[1]:
-            convolution_param['pad'] = m.padding[0]
-        else:
-            convolution_param['pad_h'] = m.padding[0]
-            convolution_param['pad_w'] = m.padding[1]
-        layer = L.Convolution(
-            layer,
-            param=param,
-            convolution_param=convolution_param,
-        )
-        caffe_net.tops[m.g_name] = layer
-        return layer
-
-    if isinstance(m, nn.ConvTranspose2d):
-        if m.bias is None:
-            param=[dict(lr_mult=1, decay_mult=1)]
-        else:
-            param=[dict(lr_mult=1, decay_mult=1), dict(lr_mult=1, decay_mult=0)]
-        assert m.dilation[0] == m.dilation[1]
-        convolution_param=dict(
-            num_output=m.out_channels,
-            group=m.groups, bias_term=(m.bias is not None),
-            weight_filler=dict(type='msra'),
-            dilation=m.dilation[0],
-        )
-        if m.kernel_size[0] == m.kernel_size[1]:
-            convolution_param['kernel_size'] = m.kernel_size[0]
-        else:
-            convolution_param['kernel_h'] = m.kernel_size[0]
-            convolution_param['kernel_w'] = m.kernel_size[1]
-        if m.stride[0] == m.stride[1]:
-            convolution_param['stride'] = m.stride[0]
-        else:
-            convolution_param['stride_h'] = m.stride[0]
-            convolution_param['stride_w'] = m.stride[1]
-        if m.padding[0] == m.padding[1]:
-            convolution_param['pad'] = m.padding[0]
-        else:
-            convolution_param['pad_h'] = m.padding[0]
-            convolution_param['pad_w'] = m.padding[1]
-        layer = L.Deconvolution(
-            layer,
-            param=param,
-            convolution_param=convolution_param,
-        )
-        caffe_net.tops[m.g_name] = layer
-        return layer
-
-    if isinstance(m, nn.BatchNorm2d):
-        layer = L.BatchNorm(
-            layer, in_place=True,
-            param=[dict(lr_mult=0, decay_mult=0), dict(lr_mult=0, decay_mult=0), dict(lr_mult=0, decay_mult=0)],
-        )
-        caffe_net[m.g_name] = layer
-        if m.affine:
-            layer = L.Scale(
-                layer, in_place=True, bias_term=True,
-                filler=dict(type='constant', value=1), bias_filler=dict(type='constant', value=0),
-                param=[dict(lr_mult=1, decay_mult=0), dict(lr_mult=1, decay_mult=0)],
-            )
-            caffe_net[m.g_name + '/scale'] = layer
-        return layer
-
-    if isinstance(m, nn.ReLU):
-        layer = L.ReLU(layer, in_place=True)
-        caffe_net.tops[m.g_name] = layer
-        return layer
-
-    if isinstance(m, nn.PReLU):
-        layer = L.PReLU(layer)
-        caffe_net.tops[m.g_name] = layer
-        return layer
-
-    if isinstance(m, nn.AvgPool2d) or isinstance(m, nn.MaxPool2d):
-        if isinstance(m, nn.AvgPool2d):
-            pooling_param = dict(pool=P.Pooling.AVE)
-        else:
-            pooling_param = dict(pool=P.Pooling.MAX)
-        if isinstance(m.kernel_size, tuple) or isinstance(m.kernel_size, list):
-            pooling_param['kernel_h'] = m.kernel_size[0]
-            pooling_param['kernel_w'] = m.kernel_size[1]
-        else:
-            pooling_param['kernel_size'] = m.kernel_size
-        if isinstance(m.stride, tuple) or isinstance(m.stride, list):
-            pooling_param['stride_h'] = m.stride[0]
-            pooling_param['stride_w'] = m.stride[1]
-        else:
-            pooling_param['stride'] = m.stride
-        if isinstance(m.padding, tuple) or isinstance(m.padding, list):
-            pooling_param['pad_h'] = m.padding[0]
-            pooling_param['pad_w'] = m.padding[1]
-        else:
-            pooling_param['pad'] = m.padding
-        layer = L.Pooling(layer, pooling_param=pooling_param)
-        caffe_net.tops[m.g_name] = layer
-        return layer
-    raise Exception("Unknow module '%s' to generate caffe prototxt." % m)
-
-
-def convert_pytorch_to_caffe(torch_net, caffe_net):
-    for name, m in torch_net.named_modules():
-        if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
-            print('convert conv:', name, m.g_name, m)
-            caffe_net.params[m.g_name][0].data[...] = m.weight.data.cpu().numpy()
-            if m.bias is not None:
-                caffe_net.params[m.g_name][1].data[...] = m.bias.data.cpu().numpy()
-        if isinstance(m, nn.BatchNorm2d):
-            print('convert bn:', name, m.g_name, m)
-            caffe_net.params[m.g_name][0].data[...] = m.running_mean.cpu().numpy()
-            caffe_net.params[m.g_name][1].data[...] = m.running_var.cpu().numpy()
-            caffe_net.params[m.g_name][2].data[...] = 1
-            if m.affine:
-                caffe_net.params[m.g_name + '/scale'][0].data[...] = m.weight.data.cpu().numpy()
-                caffe_net.params[m.g_name + '/scale'][1].data[...] = m.bias.data.cpu().numpy()
-
-
-def conv_bn_relu(name, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1):
-    return nn.Sequential(
-        g_name(name, nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, False)),
-        g_name(name + '/bn', nn.BatchNorm2d(out_channels)),
-        g_name(name + '/relu', nn.ReLU(inplace=True)),
-    )
-
-
-def conv_bn(name, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1):
-    return nn.Sequential(
-        g_name(name, nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, False)),
-        g_name(name + '/bn', nn.BatchNorm2d(out_channels)),
-    )
-
-
-def conv(name, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1):
-    return g_name(name, nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, True))
-
-
-def conv_relu(name, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1):
-    return nn.Sequential(
-        g_name(name, nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, True)),
-        g_name(name + '/relu', nn.ReLU()),
-    )
-
-def conv_prelu(name, in_channels, out_channels, kernel_size, stride=1, padding=0, dilation=1, groups=1):
-    return nn.Sequential(
-        g_name(name, nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, True)),
-        g_name(name + '/prelu', nn.PReLU()),
-    )
-    
-
-if __name__ == '__main__':
-
-    class BasicBlock(nn.Module):
-
-        def __init__(self, name, in_channels, middle_channels, out_channels, stride, residual):
-            super(BasicBlock, self).__init__()
-            self.g_name = name
-            self.residual = residual
-            self.conv = [
-                conv_bn(name + '/conv1', 
-                    in_channels, in_channels, 3, stride=stride, padding=1, groups=in_channels),
-                conv_bn_relu(name + '/conv2', in_channels, middle_channels, 1),
-                conv_bn(name + '/conv3', middle_channels, out_channels, 1),
-            ]
-            self.conv = nn.Sequential(*self.conv)
-            # self.relu = g_name(name + '/relu', nn.ReLU(inplace=True))
-
-        def forward(self, x):
-            x = x + self.conv(x) if self.residual else self.conv(x)
-            # x = self.relu(x)
-            return x
-
-        def generate_caffe_prototxt(self, caffe_net, layer):
-            residual_layer = layer
-            layer = generate_caffe_prototxt(self.conv, caffe_net, layer)
-            if self.residual:
-                layer = L.Eltwise(residual_layer, layer, operation=P.Eltwise.SUM)
-                caffe_net[self.g_name + '/sum'] = layer
-            # layer = generate_caffe_prototxt(self.relu, caffe_net, layer)
-            return layer
-
-
-    class Network(nn.Module):
-
-        def __init__(self, num_outputs, width_multiplier=32):
-            super(Network, self).__init__()
-
-            assert width_multiplier >= 0 and width_multiplier <= 256
-            # assert width_multiplier % 2 == 0
-
-            self.network = [
-                g_name('data/bn', nn.BatchNorm2d(3)),
-                conv_bn_relu('stage1/conv', 3, 32, 3, 2, 1),
-                # g_name('stage1/pool', nn.MaxPool2d(3, 2, 0, ceil_mode=True)),
-            ]
-            channel = lambda i: (2**i) * int(width_multiplier)
-            network_parameters = [
-                (32,         channel(2) * 4, channel(2), 2, 2),
-                (channel(2), channel(2) * 4, channel(2), 2, 4),
-                (channel(2), channel(3) * 4, channel(3), 2, 8),
-                (channel(3), channel(4) * 4, channel(4), 2, 4),
-            ]
-            for i, parameters in enumerate(network_parameters):
-                in_channels, middle_channels, out_channels, stride, num_blocks = parameters
-                self.network += [self._generate_stage('stage_{}'.format(i + 2), 
-                    in_channels, middle_channels, out_channels, stride, num_blocks)]
-            self.network += [
-                conv_bn_relu('unsqueeze', out_channels, out_channels * 4, 1),
-                g_name('pool_fc', nn.AvgPool2d(7)),
-                g_name('fc', nn.Conv2d(out_channels * 4, num_outputs, 1)),
-            ]
-            self.network = nn.Sequential(*self.network)
-
-            for name, m in self.named_modules():
-                if any(map(lambda x: isinstance(m, x), [nn.Linear, nn.Conv1d, nn.Conv2d])):
-                    nn.init.kaiming_normal(m.weight, mode='fan_out')
-                    if m.bias is not None:
-                        nn.init.constant(m.bias, 0)
-
-        def _generate_stage(self, name, in_channels, middle_channels, out_channels, stride, num_blocks):
-            blocks = [BasicBlock(name + '_1', in_channels, middle_channels, out_channels, 2, False)]
-            for i in range(1, num_blocks):
-                blocks.append(BasicBlock(name + '_{}'.format(i + 1), 
-                    out_channels, middle_channels, out_channels, 1, True))
-            return nn.Sequential(*blocks)
-
-        def forward(self, x):
-            return self.network(x).view(x.size(0), -1)
-        
-        def generate_caffe_prototxt(self, caffe_net, layer):
-            return generate_caffe_prototxt(self.network, caffe_net, layer)
-
-        def convert_to_caffe(self, name):
-            caffe_net = caffe.NetSpec()
-            layer = L.Input(shape=dict(dim=[1, 3, 224, 224]))
-            caffe_net.tops['data'] = layer
-            generate_caffe_prototxt(self, caffe_net, layer)
-            print(caffe_net.to_proto())
-            with open(name + '.prototxt', 'wb') as f:
-                f.write(str(caffe_net.to_proto()))
-            caffe_net = caffe.Net(name + '.prototxt', caffe.TEST)
-            convert_pytorch_to_caffe(self, caffe_net)
-            caffe_net.save(name + '.caffemodel')
-
-
-    network = Network(1000, 8)
-    print(network)
-    network.convert_to_caffe('net')
\ No newline at end of file