# lars_optimizer.py

# coding=utf-8
# Copyright 2020 The SimCLR Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Functions and classes related to optimization (weight updates)."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import re

import tensorflow.compat.v1 as tf

# Default value of the `eeta` argument of LARSOptimizer, where it multiplies
# the weight-norm / update-norm quotient when the layer-wise trust ratio is
# computed.
EETA_DEFAULT = 0.001


class LARSOptimizer(tf.train.Optimizer):
    """Layer-wise Adaptive Rate Scaling for large batch training.

    Introduced by "Large Batch Training of Convolutional Networks" by Y. You,
    I. Gitman, and B. Ginsburg. (https://arxiv.org/abs/1708.03888)

    For every (gradient, variable) pair the optimizer optionally adds L2
    weight decay to the gradient, optionally rescales the learning rate by a
    per-variable trust ratio `eeta * ||w|| / ||u||`, and applies a momentum
    update (optionally Nesterov). `u` is the (weight-decayed) gradient in
    classic momentum mode and the momentum update otherwise.
    """

    def __init__(self,
                 learning_rate,
                 momentum=0.9,
                 use_nesterov=False,
                 weight_decay=0.0,
                 exclude_from_weight_decay=None,
                 exclude_from_layer_adaptation=None,
                 classic_momentum=True,
                 eeta=EETA_DEFAULT,
                 name="LARSOptimizer"):
        """Constructs a LARSOptimizer.

        Args:
          learning_rate: A `float` for learning rate.
          momentum: A `float` for momentum.
          use_nesterov: A 'Boolean' for whether to use nesterov momentum.
          weight_decay: A `float` for weight decay.
          exclude_from_weight_decay: A list of `string` for variable screening, if
              any of the string appears in a variable's name, the variable will be
              excluded for computing weight decay. For example, one could specify
              the list like ['batch_normalization', 'bias'] to exclude BN and bias
              from weight decay. Each entry is matched with `re.search`.
          exclude_from_layer_adaptation: Similar to exclude_from_weight_decay, but
              for layer adaptation. If it is None, it will be defaulted the same as
              exclude_from_weight_decay. An explicit empty list means "exclude
              nothing from layer adaptation".
          classic_momentum: A `boolean` for whether to use classic (or popular)
              momentum. The learning rate is applied during momentum update in
              classic momentum, but after momentum for popular momentum.
          eeta: A `float` for scaling of learning rate when computing trust ratio.
          name: The name for the scope.
        """
        super(LARSOptimizer, self).__init__(False, name)

        self.learning_rate = learning_rate
        self.momentum = momentum
        self.weight_decay = weight_decay
        self.use_nesterov = use_nesterov
        self.classic_momentum = classic_momentum
        self.eeta = eeta
        self.exclude_from_weight_decay = exclude_from_weight_decay
        # Fall back to exclude_from_weight_decay only when the argument is
        # really None. A truthiness test would also discard an explicit empty
        # list, contradicting the documented contract above.
        if exclude_from_layer_adaptation is not None:
            self.exclude_from_layer_adaptation = exclude_from_layer_adaptation
        else:
            self.exclude_from_layer_adaptation = exclude_from_weight_decay

    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """Builds the op that applies one LARS update to every variable.

        Args:
          grads_and_vars: Iterable of `(gradient, variable)` pairs. Pairs in
              which either element is None are skipped.
          global_step: Variable to increment by one. If None, the result of
              `tf.train.get_or_create_global_step()` is used.
          name: Optional name for the returned op.

        Returns:
          The `tf.group` of all parameter, momentum and global-step
          assignments.
        """
        if global_step is None:
            global_step = tf.train.get_or_create_global_step()
        new_global_step = global_step + 1

        assignments = []
        for (grad, param) in grads_and_vars:
            if grad is None or param is None:
                continue

            param_name = param.op.name

            # Per-variable momentum accumulator, created (or reused) through
            # the enclosing variable scope.
            v = tf.get_variable(
                name=param_name + "/Momentum",
                shape=param.shape.as_list(),
                dtype=tf.float32,
                trainable=False,
                initializer=tf.zeros_initializer())

            if self._use_weight_decay(param_name):
                grad += self.weight_decay * param

            if self.classic_momentum:
                # Classic momentum: the (trust-ratio scaled) learning rate is
                # folded into the momentum accumulator itself.
                trust_ratio = self._compute_trust_ratio(
                    param_name, param, grad)
                scaled_lr = self.learning_rate * trust_ratio

                next_v = tf.multiply(self.momentum, v) + scaled_lr * grad
                if self.use_nesterov:
                    update = tf.multiply(
                        self.momentum, next_v) + scaled_lr * grad
                else:
                    update = next_v
                next_param = param - update
            else:
                # Popular momentum: accumulate raw gradients, then scale the
                # resulting update by the learning rate and trust ratio.
                next_v = tf.multiply(self.momentum, v) + grad
                if self.use_nesterov:
                    update = tf.multiply(self.momentum, next_v) + grad
                else:
                    update = next_v

                trust_ratio = self._compute_trust_ratio(
                    param_name, param, update)
                scaled_lr = trust_ratio * self.learning_rate
                next_param = param - scaled_lr * update

            assignments.extend([param.assign(next_param), v.assign(next_v)])

        # Increment the global step once per call rather than once per
        # variable. It is still skipped when no variable was updated, so the
        # no-gradient case keeps returning an empty group.
        if assignments:
            assignments.append(global_step.assign(new_global_step))
        return tf.group(*assignments, name=name)

    def _compute_trust_ratio(self, param_name, param, update):
        """Returns the trust ratio for `param`, or 1.0 if it is excluded.

        Args:
          param_name: Name used to decide whether layer adaptation applies.
          param: The variable being updated.
          update: The tensor whose L2 norm forms the denominator.

        Returns:
          The Python float 1.0 when `param_name` is excluded from layer
          adaptation; otherwise a scalar tensor equal to
          `eeta * ||param|| / ||update||`, replaced by 1.0 whenever either
          norm is not greater than zero.
        """
        if not self._do_layer_adaptation(param_name):
            return 1.0
        w_norm = tf.norm(param, ord=2)
        u_norm = tf.norm(update, ord=2)
        # A zero norm on either side would give a 0 or inf/NaN ratio; select
        # the neutral value 1.0 in those cases instead.
        return tf.where(
            tf.greater(w_norm, 0), tf.where(
                tf.greater(u_norm, 0), (self.eeta * w_norm / u_norm),
                1.0),
            1.0)

    @staticmethod
    def _matches_any(patterns, param_name):
        """Whether any regex in `patterns` (possibly None) hits `param_name`."""
        return any(
            re.search(pattern, param_name) is not None
            for pattern in (patterns or ()))

    def _use_weight_decay(self, param_name):
        """Whether to use L2 weight decay for `param_name`."""
        if not self.weight_decay:
            return False
        return not self._matches_any(
            self.exclude_from_weight_decay, param_name)

    def _do_layer_adaptation(self, param_name):
        """Whether to do layer-wise learning rate adaptation for `param_name`."""
        return not self._matches_any(
            self.exclude_from_layer_adaptation, param_name)