Source code for xenonpy.model.training.optimizer

#  Copyright (c) 2021. TsumiNa. All rights reserved.
#  Use of this source code is governed by a BSD-style
#  license that can be found in the LICENSE file.

from torch import optim

from xenonpy.model.training.base import BaseOptimizer

__all__ = ['Adadelta', 'Adagrad', 'Adam', 'Adamax', 'ASGD', 'SGD', 'SparseAdam', 'RMSprop', 'Rprop', 'LBFGS']


[docs]class Adadelta(BaseOptimizer):

    def __init__(self, *, lr=1.0, rho=0.9, eps=1e-06, weight_decay=0):
        """Implements Adadelta algorithm.

        It has been proposed in `ADADELTA: An Adaptive Learning Rate Method`__.

        Arguments:
            rho (float, optional): coefficient used for computing a running average
                of squared gradients (default: 0.9)
            eps (float, optional): term added to the denominator to improve
                numerical stability (default: 1e-6)
            lr (float, optional): coefficient that scale delta before it is applied
                to the parameters (default: 1.0)
            weight_decay (float, optional): weight decay (L2 penalty) (default: 0)

        __ https://arxiv.org/abs/1212.5701
        """
        super().__init__(optim.Adadelta, lr=lr, rho=rho, eps=eps, weight_decay=weight_decay)


[docs]class Adagrad(BaseOptimizer):

    def __init__(self, *, lr=0.01, lr_decay=0, weight_decay=0, initial_accumulator_value=0):
        """Implements Adagrad algorithm.

        It has been proposed in `Adaptive Subgradient Methods for Online Learning
        and Stochastic Optimization`_.

        Arguments:
            lr (float, optional): learning rate (default: 1e-2)
            lr_decay (float, optional): learning rate decay (default: 0)
            weight_decay (float, optional): weight decay (L2 penalty) (default: 0)

        .. _Adaptive Subgradient Methods for Online Learning and Stochastic
            Optimization: http://jmlr.org/papers/v12/duchi11a.html
        """
        super().__init__(optim.Adagrad,
                         lr=lr,
                         lr_decay=lr_decay,
                         weight_decay=weight_decay,
                         initial_accumulator_value=initial_accumulator_value)


[docs]class Adam(BaseOptimizer):

    def __init__(self, *, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False):
        r"""Implements Adam algorithm.

        It has been proposed in `Adam: A Method for Stochastic Optimization`_.
        The implementation of the L2 penalty follows changes proposed in `Decoupled Weight Decay Regularization`_.

        Arguments:
            lr (float, optional): learning rate (default: 1e-3)
            betas (Tuple[float, float], optional): coefficients used for computing
                running averages of gradient and its square (default: (0.9, 0.999))
            eps (float, optional): term added to the denominator to improve
                numerical stability (default: 1e-8)
            weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
            amsgrad (boolean, optional): whether to use the AMSGrad variant of this
                algorithm from the paper `On the Convergence of Adam and Beyond`_
                (default: False)

        .. _Adam\: A Method for Stochastic Optimization:
            https://arxiv.org/abs/1412.6980
        .. _Decoupled Weight Decay Regularization:
            https://arxiv.org/abs/1711.05101
        .. _On the Convergence of Adam and Beyond:
            https://openreview.net/forum?id=ryQu7f-RZ
        """

        super().__init__(optim.Adam, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad)


class AdamW(BaseOptimizer):

    def __init__(self, *, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False):
        r"""Implements AdamW algorithm.

        The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_.
        The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_.

        Arguments:
            lr (float, optional): learning rate (default: 1e-3)
            betas (Tuple[float, float], optional): coefficients used for computing
                running averages of gradient and its square (default: (0.9, 0.999))
            eps (float, optional): term added to the denominator to improve
                numerical stability (default: 1e-8)
            weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
            amsgrad (boolean, optional): whether to use the AMSGrad variant of this
                algorithm from the paper `On the Convergence of Adam and Beyond`_
                (default: False)

        .. _Adam\: A Method for Stochastic Optimization:
            https://arxiv.org/abs/1412.6980
        .. _Decoupled Weight Decay Regularization:
            https://arxiv.org/abs/1711.05101
        .. _On the Convergence of Adam and Beyond:
            https://openreview.net/forum?id=ryQu7f-RZ
        """

        super().__init__(optim.AdamW, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad)


[docs]class SparseAdam(BaseOptimizer):

    def __init__(self, *, lr=0.001, betas=(0.9, 0.999), eps=1e-08):
        r"""Implements lazy version of Adam algorithm suitable for sparse tensors.

        In this variant, only moments that show up in the gradient get updated, and
        only those portions of the gradient get applied to the parameters.

        Arguments:
            lr (float, optional): learning rate (default: 1e-3)
            betas (Tuple[float, float], optional): coefficients used for computing
                running averages of gradient and its square (default: (0.9, 0.999))
            eps (float, optional): term added to the denominator to improve
                numerical stability (default: 1e-8)

        .. _Adam\: A Method for Stochastic Optimization:
            https://arxiv.org/abs/1412.6980
        """

        super().__init__(optim.SparseAdam, lr=lr, betas=betas, eps=eps)


[docs]class Adamax(BaseOptimizer):

    def __init__(self, *, lr=0.002, betas=(0.9, 0.999), eps=1e-08, weight_decay=0):
        """Implements Adamax algorithm (a variant of Adam based on infinity norm).

        It has been proposed in `Adam: A Method for Stochastic Optimization`__.

        Arguments:
            lr (float, optional): learning rate (default: 2e-3)
            betas (Tuple[float, float], optional): coefficients used for computing
                running averages of gradient and its square
            eps (float, optional): term added to the denominator to improve
                numerical stability (default: 1e-8)
            weight_decay (float, optional): weight decay (L2 penalty) (default: 0)

        __ https://arxiv.org/abs/1412.6980
        """

        super().__init__(optim.Adamax, lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)


[docs]class ASGD(BaseOptimizer):

    def __init__(self, *, lr=0.002, lambd=0.0001, alpha=0.75, t0=1000000.0, weight_decay=0):
        """Implements Averaged Stochastic Gradient Descent.

        It has been proposed in `Acceleration of stochastic approximation by
        averaging`_.

        Arguments:
            lr (float, optional): learning rate (default: 1e-2)
            lambd (float, optional): decay term (default: 1e-4)
            alpha (float, optional): power for eta update (default: 0.75)
            t0 (float, optional): point at which to start averaging (default: 1e6)
            weight_decay (float, optional): weight decay (L2 penalty) (default: 0)

        .. _Acceleration of stochastic approximation by averaging:
            http://dl.acm.org/citation.cfm?id=131098
        """

        super().__init__(optim.ASGD, lr=lr, lambd=lambd, alpha=alpha, t0=t0, weight_decay=weight_decay)


[docs]class LBFGS(BaseOptimizer):

    def __init__(self,
                 *,
                 lr=1,
                 max_iter=20,
                 max_eval=None,
                 tolerance_grad=1e-5,
                 tolerance_change=1e-9,
                 history_size=100,
                 line_search_fn=None):
        """Implements L-BFGS algorithm.

        .. warning::
            This optimizer doesn't support per-parameter options and parameter
            groups (there can be only one).

        .. warning::
            Right now all parameters have to be on a single device. This will be
            improved in the future.

        .. note::
            This is a very memory intensive optimizer (it requires additional
            ``param_bytes * (history_size + 1)`` bytes). If it doesn't fit in memory
            try reducing the history size, or use a different algorithm.

        Arguments:
            lr (float): learning rate (default: 1)
            max_iter (int): maximal number of iterations per optimization step
                (default: 20)
            max_eval (int): maximal number of function evaluations per optimization
                step (default: max_iter * 1.25).
            tolerance_grad (float): termination tolerance on first order optimality
                (default: 1e-5).
            tolerance_change (float): termination tolerance on function
                value/parameter changes (default: 1e-9).
            history_size (int): update history size (default: 100).
        """

        super().__init__(optim.LBFGS,
                         lr=lr,
                         max_iter=max_iter,
                         max_eval=max_eval,
                         tolerance_grad=tolerance_grad,
                         tolerance_change=tolerance_change,
                         history_size=history_size,
                         line_search_fn=line_search_fn)


[docs]class RMSprop(BaseOptimizer):

    def __init__(self, *, lr=0.01, alpha=0.99, eps=1e-08, weight_decay=0, momentum=0, centered=False):
        """Implements RMSprop algorithm.

        Proposed by G. Hinton in his
        `course <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_.

        The centered version first appears in `Generating Sequences
        With Recurrent Neural Networks <https://arxiv.org/pdf/1308.0850v5.pdf>`_.

        Arguments:
            lr (float, optional): learning rate (default: 1e-2)
            momentum (float, optional): momentum factor (default: 0)
            alpha (float, optional): smoothing constant (default: 0.99)
            eps (float, optional): term added to the denominator to improve
                numerical stability (default: 1e-8)
            centered (bool, optional) : if ``True``, compute the centered RMSProp,
                the gradient is normalized by an estimation of its variance
            weight_decay (float, optional): weight decay (L2 penalty) (default: 0)

        """

        super().__init__(optim.RMSprop,
                         lr=lr,
                         alpha=alpha,
                         eps=eps,
                         weight_decay=weight_decay,
                         momentum=momentum,
                         centered=centered)


[docs]class Rprop(BaseOptimizer):

    def __init__(self, *, lr=0.01, etas=(0.5, 1.2), step_sizes=(1e-06, 50)):
        """Implements the resilient backpropagation algorithm.

        Arguments:
            lr (float, optional): learning rate (default: 1e-2)
            etas (Tuple[float, float], optional): pair of (etaminus, etaplis), that
                are multiplicative increase and decrease factors
                (default: (0.5, 1.2))
            step_sizes (Tuple[float, float], optional): a pair of minimal and
                maximal allowed step sizes (default: (1e-6, 50))
        """

        super().__init__(optim.Rprop, lr=lr, etas=etas, step_sizes=step_sizes)


[docs]class SGD(BaseOptimizer):

    def __init__(self, *, lr=0.01, momentum=0, dampening=0, weight_decay=0, nesterov=False):
        r"""Implements stochastic gradient descent (optionally with momentum).

        Nesterov momentum is based on the formula from
        `On the importance of initialization and momentum in deep learning`__.

        Args:
            lr (float): learning rate
            momentum (float, optional): momentum factor (default: 0)
            weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
            dampening (float, optional): dampening for momentum (default: 0)
            nesterov (bool, optional): enables Nesterov momentum (default: False)

        Example:
            >>> optimizer = torch.optim.SGD(lr=0.1, momentum=0.9)
            >>> optimizer(model.parameters())
            >>> optimizer.zero_grad()
            >>> loss_fn(model(input), target).backward()
            >>> optimizer.step(),

        __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf

        .. note::
            The implementation of SGD with Momentum/Nesterov subtly differs from
            Sutskever et. al. and implementations in some other frameworks.

            Considering the specific case of Momentum, the update can be written as

            .. math::
                      v = \rho * v + g \\
                      p = p - lr * v

            where p, g, v and :math:`\rho` denote the parameters, gradient,
            velocity, and momentum respectively.

            This is in contrast to Sutskever et. al. and
            other frameworks which employ an update of the form

            .. math::
                 v = \rho * v + lr * g \\
                 p = p - v

            The Nesterov version is analogously modified.
        """

        super().__init__(optim.SGD,
                         lr=lr,
                         momentum=momentum,
                         dampening=dampening,
                         weight_decay=weight_decay,
                         nesterov=nesterov)