``````

using

Flux

using

MacroTools
:

@
forward

# Common supertype of every optimiser rule in this file; the `apply!` methods
# below dispatch on its concrete subtypes.
abstract type AbstractOptimiser end

# Default value for the small constant `ϵ` that the optimisers below add to
# denominators and square roots.
const EPS = 1e-8
``````

TODO: should use weak refs (presumably for the `IdDict` tables that key optimiser state by parameter).

``````

"""

Descent(η = 0.1)

Classic gradient descent optimiser with learning rate `η`.

For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`

# Parameters

- Learning rate (`η`): Amount by which gradients are discounted before updating

the weights.

# Examples

```julia

opt = Descent()

opt = Descent(0.3)

ps = Flux.params(model)

loss(x, y)

end

Flux.Optimise.update!(opt, ps, gs)

```

"""

mutable

struct

Descent

<:

AbstractOptimiser

eta
::
Float64

end

Descent
(
)

=

Descent
(
0.1
)

# Scale the gradient `Δ` in place by the learning rate `o.eta` and return it.
# The parameter `x` is part of the shared `apply!` signature but is not read here.
function apply!(o::Descent, x, Δ)
    η = o.eta
    Δ .*= η
    return Δ
end

"""

Momentum(η = 0.01, ρ = 0.9)

Gradient descent optimiser with learning rate `η` and momentum `ρ`.

# Parameters

- Learning rate (`η`): Amount by which gradients are discounted before updating

the weights.

- Momentum (`ρ`): Controls the acceleration of gradient descent in the

prominent direction, in effect damping oscillations.

# Examples

```julia

opt = Momentum()

opt = Momentum(0.01, 0.99)

```

"""

mutable

struct

Momentum

<:

AbstractOptimiser

eta
::
Float64

rho
::
Float64

velocity
::
IdDict

end

Momentum
(

η

=

0.01
,

ρ

=

0.9
)

=

Momentum
(
η
,

ρ
,

IdDict
(
)
)

# Momentum update. The velocity stored for `x` is updated in place to
# `ρ * v - η * Δ`, and `Δ` is overwritten with its negation and returned.
function apply!(o::Momentum, x, Δ)
    lr, mom = o.eta, o.rho
    # One velocity buffer per parameter, created as zeros on first use.
    vel = get!(o.velocity, x) do
        zero(x)
    end::typeof(x)
    vel .= mom .* vel .- lr .* Δ
    Δ .= .-vel
    return Δ
end

"""

Nesterov(η = 0.001, ρ = 0.9)

Gradient descent optimiser with learning rate `η` and Nesterov momentum `ρ`.

# Parameters

- Learning rate (`η`): Amount by which gradients are discounted before updating

the weights.

- Nesterov momentum (`ρ`): Controls the acceleration of gradient descent in the

prominent direction, in effect damping oscillations.

# Examples

```julia

opt = Nesterov()

opt = Nesterov(0.003, 0.95)

```

"""

mutable

struct

Nesterov

<:

AbstractOptimiser

eta
::
Float64

rho
::
Float64

velocity
::
IdDict

end

Nesterov
(

η

=

0.001
,

ρ

=

0.9
)

=

Nesterov
(
η
,

ρ
,

IdDict
(
)
)

# Nesterov momentum update. Overwrites `Δ` with the look-ahead step and
# refreshes the velocity stored for `x`; returns `Δ`.
function apply!(o::Nesterov, x, Δ)
    lr, mom = o.eta, o.rho
    # One velocity buffer per parameter, created as zeros on first use.
    vel = get!(o.velocity, x) do
        zero(x)
    end::typeof(x)
    # Computed from the velocity *before* it is refreshed on the next line.
    lookahead = @. mom^2 * vel - (1 + mom) * lr * Δ
    @. vel = mom * vel - lr * Δ
    @. Δ = -lookahead
    return Δ
end

"""

RMSProp(η = 0.001, ρ = 0.9, ϵ =
\$

EPS
)

Optimizer using the

[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)

algorithm. Often a good choice for recurrent networks. Parameters other than learning rate

generally don't need tuning.

# Parameters

- Learning rate (`η`): Amount by which gradients are discounted before updating

the weights.

- Momentum (`ρ`): Controls the acceleration of gradient descent in the

prominent direction, in effect damping oscillations.

# Examples

```julia

opt = RMSProp()

opt = RMSProp(0.002, 0.95)

```

"""

mutable

struct

RMSProp

<:

AbstractOptimiser

eta
::
Float64

rho
::
Float64

epsilon
::
Float64

acc
::
IdDict

end

RMSProp
(

η
::
Real

=

0.001
,

ρ
::
Real

=

0.9
,

ϵ
::
Real

=

EPS
)

=

RMSProp
(
η
,

ρ
,

ϵ
,

IdDict
(
)
)

RMSProp
(

η
::
Real
,

ρ
::
Real
,

acc
::
IdDict
)

=

RMSProp
(
η
,

ρ
,

EPS
,

acc
)

# RMSProp update. Maintains, per parameter, a decaying average of `Δ * conj(Δ)`
# and rescales `Δ` in place by `η / (√average + ϵ)`; returns `Δ`.
function apply!(o::RMSProp, x, Δ)
    lr, decay = o.eta, o.rho
    # One accumulator per parameter, created as zeros on first use.
    sqavg = get!(o.acc, x) do
        zero(x)
    end::typeof(x)
    @. sqavg = decay * sqavg + (1 - decay) * Δ * conj(Δ)
    @. Δ *= lr / (√sqavg + o.epsilon)
    return Δ
end

"""

Adam(η = 0.001, β::Tuple = (0.9, 0.999), ϵ =
\$

EPS
)

# Parameters

- Learning rate (`η`): Amount by which gradients are discounted before updating

the weights.

- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the

second (β2) momentum estimate.

# Examples

```julia

```

"""

mutable

struct

<:

AbstractOptimiser

eta
::
Float64

beta
::

Tuple
{
Float64
,
Float64
}

epsilon
::
Float64

state
::

IdDict
{
Any
,

Any
}

end

(

η
::
Real

=

0.001
,

β
::
Tuple

=

(
0.9
,

0.999
)
,

ϵ
::
Real

=

EPS
)

=

(
η
,

β
,

ϵ
,

IdDict
(
)
)

(

η
::
Real
,

β
::
Tuple
,

state
::
IdDict
)

=

(
η
,

β
,

EPS
,

state
)

function

apply!
(

o
::

,

x
,

Δ
)

η
,

β

=

o
.

eta
,

o
.

beta

mt
,

vt
,

βp

=

get!
(

o
.

state
,

x
)

do

(

zero
(
x
)
,

zero
(
x
)
,

Float64
[

β
[
1
]
,

β
[
2
]
]
)

end

::

Tuple
{

typeof
(
x
)
,

typeof
(
x
)
,

Vector
{
Float64
}
}

@
.

mt

=

β
[
1
]

*

mt

+

(

1

-

β
[
1
]
)

*

Δ

@
.

vt

=

β
[
2
]

*

vt

+

(

1

-

β
[
2
]
)

*

Δ

*

conj
(
Δ
)

@
.

Δ

=

mt

/

(

1

-

βp
[
1
]
)

/

(

√
(

vt

/

(

1

-

βp
[
2
]
)
)

+

o
.

epsilon
)

*

η

βp

.=

βp

.*

β

return

Δ

end

"""

RAdam(η = 0.001, β::Tuple = (0.9, 0.999), ϵ =
\$

EPS
)

# Parameters

- Learning rate (`η`): Amount by which gradients are discounted before updating

the weights.

- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the

second (β2) momentum estimate.

# Examples

```julia

```

"""

mutable

struct

<:

AbstractOptimiser

eta
::
Float64

beta
::

Tuple
{
Float64
,
Float64
}

epsilon
::
Float64

state
::

IdDict
{
Any
,

Any
}

end

(

η
::
Real

=

0.001
,

β
::
Tuple

=

(
0.9
,

0.999
)
,

ϵ
::
Real

=

EPS
)

=

(
η
,

β
,

ϵ
,

IdDict
(
)
)

(

η
::
Real
,

β
::
Tuple
,

state
::
IdDict
)

=

(
η
,

β
,

EPS
,

state
)

function

apply!
(

o
::

,

x
,

Δ
)

η
,

β

=

o
.

eta
,

o
.

beta

ρ∞

=

2
/
(

1
-

β
[
2
]
)
-
1

mt
,

vt
,

βp
,

t

=

get!
(

o
.

state
,

x
)

do

(

zero
(
x
)
,

zero
(
x
)
,

Float64
[

β
[
1
]
,

β
[
2
]
]
,

Ref
(
1
)
)

end

::

Tuple
{

typeof
(
x
)
,

typeof
(
x
)
,

Vector
{
Float64
}
,

Base
.

RefValue
{
Int
}
}

@
.

mt

=

β
[
1
]

*

mt

+

(

1

-

β
[
1
]
)

*

Δ

@
.

vt

=

β
[
2
]

*

vt

+

(

1

-

β
[
2
]
)

*

Δ

*

conj
(
Δ
)

ρ

=

ρ∞

-

2

t
[
]

*

βp
[
2
]

/

(

1

-

βp
[
2
]
)

if

ρ

>

4

r

=

sqrt
(

(

ρ
-
4
)
*
(

ρ
-
2
)
*
ρ∞
/
(

(

ρ∞
-
4
)
*
(

ρ∞
-
2
)
*
ρ
)
)

@
.

Δ

=

mt

/

(

1

-

βp
[
1
]
)

/

(

√
(

vt

/

(

1

-

βp
[
2
]
)
)

+

o
.

epsilon
)

*

η

*

r

else

@
.

Δ

=

mt

/

(

1

-

βp
[
1
]
)

*

η

end

βp

.=

βp

.*

β

t
[
]

+=

1

return

Δ

end

"""

AdaMax(η = 0.001, β::Tuple = (0.9, 0.999), ϵ =
\$

EPS
)

# Parameters

- Learning rate (`η`): Amount by which gradients are discounted before updating

the weights.

- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the

second (β2) momentum estimate.

# Examples

```julia

```

"""

mutable

struct

<:

AbstractOptimiser

eta
::
Float64

beta
::

Tuple
{
Float64
,
Float64
}

epsilon
::
Float64

state
::

IdDict
{
Any
,

Any
}

end

(

η
::
Real

=

0.001
,

β
::
Tuple

=

(
0.9
,

0.999
)
,

ϵ
::
Real

=

EPS
)

=

(
η
,

β
,

ϵ
,

IdDict
(
)
)

(

η
::
Real
,

β
::
Tuple
,

state
::
IdDict
)

=

(
η
,

β
,

EPS
,

state
)

function

apply!
(

o
::

,

x
,

Δ
)

η
,

β

=

o
.

eta
,

o
.

beta

mt
,

ut
,

βp

=

get!
(

o
.

state
,

x
)

do

(

zero
(
x
)
,

zero
(
x
)
,

Float64
[

β
[
1
]
,

β
[
2
]
]
)

end

::

Tuple
{

typeof
(
x
)
,

typeof
(
x
)
,

Vector
{
Float64
}
}

@
.

mt

=

β
[
1
]

*

mt

+

(

1

-

β
[
1
]
)

*

Δ

@
.

ut

=

max
(

β
[
2
]

*

ut
,

abs
(
Δ
)
)

@
.

Δ

=

(

η
/
(

1

-

βp
[
1
]
)
)

*

mt
/
(

ut

+

o
.

epsilon
)

βp

.=

βp

.*

β

return

Δ

end

"""

OAdam(η = 0.0001, β::Tuple = (0.5, 0.9), ϵ =
\$

EPS
)

# Parameters

- Learning rate (`η`): Amount by which gradients are discounted before updating

the weights.

- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the

second (β2) momentum estimate.

# Examples

```julia

```

"""

mutable

struct

<:

AbstractOptimiser

eta
::
Float64

beta
::

Tuple
{
Float64
,
Float64
}

epsilon
::
Float64

state
::

IdDict
{
Any
,

Any
}

end

(

η
::
Real

=

0.001
,

β
::
Tuple

=

(
0.5
,

0.9
)
,

ϵ
::
Real

=

EPS
)

=

(
η
,

β
,

ϵ
,

IdDict
(
)
)

(

η
::
Real
,

β
::
Tuple
,

state
::
IdDict
)

=

RMSProp
(
η
,

β
,

EPS
,

state
)

function

apply!
(

o
::

,

x
,

Δ
)

η
,

β

=

o
.

eta
,

o
.

beta

mt
,

vt
,

Δ_
,

βp

=

get!
(

o
.

state
,

x
)

do

(

zero
(
x
)
,

zero
(
x
)
,

zero
(
x
)
,

Float64
[

β
[
1
]
,

β
[
2
]
]
)

end

::

Tuple
{

typeof
(
x
)
,

typeof
(
x
)
,

typeof
(
x
)
,

Vector
{
Float64
}
}

@
.

mt

=

β
[
1
]

*

mt

+

(

1

-

β
[
1
]
)

*

Δ

@
.

vt

=

β
[
2
]

*

vt

+

(

1

-

β
[
2
]
)

*

Δ

*

conj
(
Δ
)

@
.

Δ

=

-
Δ_

@
.

Δ_

=

η

*

mt

/

(

1

-

βp
[
1
]
)

/

(

√
(

vt

/

(

1

-

βp
[
2
]
)
)

+

o
.

epsilon
)

@
.

Δ

+=

2

Δ_

βp

.=

βp

.*

β

return

Δ

end

"""

\$

EPS
)

parameter specific learning rates based on how frequently it is updated.

Parameters don't need tuning.

# Parameters

- Learning rate (`η`): Amount by which gradients are discounted before updating

the weights.

# Examples

```julia

```

"""

mutable

struct

<:

AbstractOptimiser

eta
::
Float64

epsilon
::
Float64

acc
::
IdDict

end

(

η
::
Real

=

0.1
,

ϵ
::
Real

=

EPS
)

=

(
η
,

ϵ
,

IdDict
(
)
)

(

η
::
Real
,

state
::
IdDict
)

=

(
η
,

EPS
,

state
)

function

apply!
(

o
::

,

x
,

Δ
)

η

=

o
.

eta

acc

=

get!
(

(
)

->

fill!
(

similar
(
x
)
,

o
.

epsilon
)
,

o
.

acc
,

x
)
::

typeof
(
x
)

@
.

acc

+=

Δ

*

conj
(
Δ
)

@
.

Δ

*=

η

/

(

√
acc

+

o
.

epsilon
)

end

"""

\$

EPS
)

Parameters don't need tuning.

# Parameters

- Rho (`ρ`): Factor by which the gradient is decayed at each time step.

# Examples

```julia

```

"""

mutable

struct

<:

AbstractOptimiser

rho
::
Float64

epsilon
::
Float64

state
::

IdDict
{
Any
,

Any
}

end

(

ρ
::
Real

=

0.9
,

ϵ
::
Real

=

EPS
)

=

(
ρ
,

ϵ
,

IdDict
(
)
)

(

ρ
::
Real
,

state
::
IdDict
)

=

(
ρ
,

EPS
,

state
)

function

apply!
(

o
::

,

x
,

Δ
)

ρ

=

o
.

rho

acc
,

Δacc

=

get!
(

(
)

->

(

zero
(
x
)
,

zero
(
x
)
)
,

o
.

state
,

x
)
::

NTuple
{
2
,

typeof
(
x
)
}

@
.

acc

=

ρ

*

acc

+

(

1

-

ρ
)

*

Δ

*

conj
(
Δ
)

# DON'T remove epsilon from numerator

# or even out of the square roots

@
.

Δ

*=

√
(

Δacc

+

o
.

epsilon
)

/

√
(

acc

+

o
.

epsilon
)

@
.

Δacc

=

ρ

*

Δacc

+

(

1

-

ρ
)

*

Δ

*

conj
(
Δ
)

return

Δ

end

"""

AMSGrad(η = 0.001, β::Tuple = (0.9, 0.999), ϵ =
\$

EPS
)

optimiser. Parameters don't need tuning.

# Parameters

- Learning rate (`η`): Amount by which gradients are discounted before updating

the weights.

- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the

second (β2) momentum estimate.

# Examples

```julia

```

"""

mutable

struct

<:

AbstractOptimiser

eta
::
Float64

beta
::

Tuple
{
Float64
,

Float64
}

epsilon
::
Float64

state
::

IdDict
{
Any
,

Any
}

end

(

η
::
Real

=

0.001
,

β

=

(
0.9
,

0.999
)
,

ϵ
::
Real

=

EPS
)

=

(
η
,

β
,

ϵ
,

IdDict
(
)
)

(

η
::
Real
,

β
::
Tuple
,

state
::
IdDict
)

=

(
η
,

β
,

EPS
,

state
)

function

apply!
(

o
::

,

x
,

Δ
)

η
,

β

=

o
.

eta
,

o
.

beta

mt
,

vt
,

v̂t

=

get!
(

o
.

state
,

x
)

do

(

fill!
(

similar
(
x
)
,

o
.

epsilon
)
,

fill!
(

similar
(
x
)
,

o
.

epsilon
)
,

fill!
(

similar
(
x
)
,

o
.

epsilon
)
)

end

::

NTuple
{
3
,

typeof
(
x
)
}

@
.

mt

=

β
[
1
]

*

mt

+

(

1

-

β
[
1
]
)

*

Δ

@
.

vt

=

β
[
2
]

*

vt

+

(

1

-

β
[
2
]
)

*

Δ

^

2

@
.

v̂t

=

max
(
v̂t
,

vt
)

@
.

Δ

=

η

*

mt

/

(

√
v̂t

+

o
.

epsilon
)

end

"""

NAdam(η = 0.001, β::Tuple = (0.9, 0.999), ϵ =
\$

EPS
)

Parameters don't need tuning.

# Parameters

- Learning rate (`η`): Amount by which gradients are discounted before updating

the weights.

- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the

second (β2) momentum estimate.

# Examples

```julia

```

"""

mutable

struct

<:

AbstractOptimiser

eta
::
Float64

beta
::

Tuple
{
Float64
,

Float64
}

epsilon
::
Float64

state
::

IdDict
{
Any
,

Any
}

end

(

η
::
Real

=

0.001
,

β

=

(
0.9
,

0.999
)
,

ϵ
::
Real

=

EPS
)

=

(
η
,

β
,

ϵ
,

IdDict
(
)
)

(

η
::
Real
,

β
::
Tuple
,

state
::
IdDict
)

=

(
η
,

β
,

EPS
,

state
)

function

apply!
(

o
::

,

x
,

Δ
)

η
,

β

=

o
.

eta
,

o
.

beta

mt
,

vt
,

βp

=

get!
(

o
.

state
,

x
)

do

(

zero
(
x
)
,

zero
(
x
)
,

Float64
[

o
.

beta
[
1
]
,

o
.

beta
[
2
]
]
)

end

::

Tuple
{

typeof
(
x
)
,

typeof
(
x
)
,

Vector
{
Float64
}
}

β1p
,

β2p

=

βp

@
.

mt

=

β
[
1
]

*

mt

+

(

1

-

β
[
1
]
)

*

Δ

@
.

vt

=

β
[
2
]

*

vt

+

(

1

-

β
[
2
]
)

*

Δ

*

conj
(
Δ
)

@
.

Δ

=

(

β
[
1
]

*

mt

/

(

1

-

β
[
1
]

*

β1p
)

+

(

1

-

β
[
1
]
)

*

Δ

/

(

1

-

β1p
)
)

/

(

√
(

vt

*

β
[
2
]

/

(

1

-

β2p
)
)

+

o
.

epsilon
)

*

η

βp

.=

βp

.*

β

return

Δ

end

"""

AdamW(η = 0.001, β::Tuple = (0.9, 0.999), decay = 0)

weight decay regularization.

# Parameters

- Learning rate (`η`): Amount by which gradients are discounted before updating

the weights.

- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the

second (β2) momentum estimate.

- `decay`: Decay applied to weights during optimisation.

# Examples

```julia

opt = AdamW(0.001, (0.89, 0.995), 0.1)

```

"""

(

η

=

0.001
,

β

=

(
0.9
,

0.999
)
,

decay

=

0
)

=

Optimiser
(

(
η
,

β
)
,

WeightDecay
(
decay
)
)

"""

AdaBelief(η = 0.001, β::Tuple = (0.9, 0.999), ϵ =
\$

EPS
)

The [AdaBelief](https://arxiv.org/abs/2010.07468) optimiser is a variant of the well-known

# Parameters

- Learning rate (`η`): Amount by which gradients are discounted before updating

the weights.

- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the

second (β2) momentum estimate.

# Examples

```julia

```

"""

mutable

struct

<:

AbstractOptimiser

eta
::
Float64

beta
::

Tuple
{
Float64
,
Float64
}

epsilon
::
Float64

state
::

IdDict
{
Any
,

Any
}

end

(

η
::
Real

=

0.001
,

β

=

(
0.9
,

0.999
)
,

ϵ
::
Real

=

EPS
)

=

(
η
,

β
,

ϵ
,

IdDict
(
)
)

(

η
::
Real
,

β
::
Tuple
,

state
::
IdDict
)

=

(
η
,

β
,

EPS
,

state
)

function

apply!
(

o
::

,

x
,

Δ
)

η
,

β

=

o
.

eta
,

o
.

beta

mt
,

st
,

βp

=

get!
(

o
.

state
,

x
)

do

(

zero
(
x
)
,

zero
(
x
)
,

Float64
[

β
[
1
]
,

β
[
2
]
]
)

end

::

Tuple
{

typeof
(
x
)
,

typeof
(
x
)
,

Vector
{
Float64
}
}

#= st is a variance and can go to zero. This is in contrast to Adam, which uses the
second moment which is usually far enough from zero. This is problematic, since st
can be slightly negative due to numerical error, and the square root below will fail.
Also, if we want to differentiate through the optimiser, √0 is not differentiable.
To protect against this, we add a small number, st -> st + eps2.
uses the square of Adam's epsilon, which we do here.

eps2

=

o
.

epsilon
^
2

# TODO: make epsilon^2 the default in next breaking release

@
.

mt

=

β
[
1
]

*

mt

+

(

1

-

β
[
1
]
)

*

Δ

@
.

st

=

β
[
2
]

*

st

+

(

1

-

β
[
2
]
)

*

(

Δ

-

mt
)

*

conj
(

Δ

-

mt
)

+

eps2

@
.

Δ

=

η

*

mt

/

(

1

-

βp
[
1
]
)

/

(

√
(

st

/

(

1

-

βp
[
2
]
)
)

+

eps2
)

βp

.=

βp

.*

β

return

Δ

end``````

Compose optimisers

``````

"""

Optimiser(a, b, c...)

Combine several optimisers into one; each optimiser produces a modified gradient

that will be fed into the next, and this is finally applied to the parameter as

usual.

!!! note

This will be replaced by `Optimisers.OptimiserChain` in Flux 0.14.

"""

mutable

struct

Optimiser

<:

AbstractOptimiser

os
::

Vector
{
Any
}

end

Optimiser
(

opts
::

AbstractOptimiser
...
)

=

Optimiser
(

Any
[

opts
...
]
)

# Forward indexing, mutation and iteration on an `Optimiser` to its `os` vector.
@forward Optimiser.os (
    Base.getindex,
    Base.first,
    Base.last,
    Base.lastindex,
    Base.push!,
    Base.setindex!,
    Base.iterate,
)

# Indexing with an array of indices returns a new `Optimiser` wrapping the
# selected rules, rather than a bare vector.
function Base.getindex(c::Optimiser, i::AbstractArray)
    picked = c.os[i]
    return Optimiser(picked...)
end

# Run every rule in `o.os` in order, feeding each rule's output gradient into
# the next one, and return the final gradient.
function apply!(o::Optimiser, x, Δ)
    return foldl((grad, rule) -> apply!(rule, x, grad), o.os; init = Δ)
end

"""

InvDecay(γ = 0.001)

Apply inverse time decay to an optimiser, so that the effective step size at

iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size.

The wrapped optimiser's step size is not modified.

for more general scheduling techniques.

# Examples

`InvDecay` is typically composed  with other optimisers

as the last transformation of the gradient:

```julia

# Inverse decay of the learning rate

# with starting value 0.001 and decay coefficient 0.01.

```

"""

mutable

struct

InvDecay

<:

AbstractOptimiser

gamma
::
Float64

state
::

IdDict
{
Any
,

Int
}

end

InvDecay
(

γ

=

0.001
)

=

InvDecay
(
γ
,

IdDict
{
Any
,

Int
}
(
)
)

# Scale `Δ` in place by `1 / (1 + γ * n)`, where `n` is the number of the
# current call for parameter `x` (starting at 1), then advance that counter.
function apply!(o::InvDecay, x, Δ)
    γ = o.gamma
    k = get!(o.state, x, 1)
    scale = 1 / (1 + γ * k)
    Δ .*= scale
    o.state[x] = k + 1
    return Δ
end

"""

ExpDecay(η = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4, start = 1)

Discount the learning rate `η` by the factor `decay` every `decay_step` steps till

a minimum of `clip`.

# Parameters

- Learning rate (`η`): Amount by which gradients are discounted before updating

the weights.

- `decay`: Factor by which the learning rate is discounted.

- `decay_step`: Schedule decay operations by setting the number of steps between

two decay operations.

- `clip`: Minimum value of learning rate.

- 'start': Step at which the decay starts.

for more general scheduling techniques.

# Examples

`ExpDecay` is typically composed  with other optimisers

as the last transformation of the gradient:

```julia

```

Note: you may want to start with `η=1` in `ExpDecay` when combined with other

optimisers (`Adam` in this case) that have their own learning rate.

"""

mutable

struct

ExpDecay

<:

AbstractOptimiser

eta
::
Float64

decay
::
Float64

step
::
Int64

clip
::
Float64

start
::
Int64

current
::
IdDict

end

ExpDecay
(

opt

=

0.001
,

decay

=

0.1
,

decay_step

=

1000
,

clip

=

1e-4
,

start

=

0
)

=

ExpDecay
(
opt
,

decay
,

decay_step
,

clip
,

start
,

IdDict
(
)
)

# Scale `Δ` in place by the current learning rate `o.eta`, first decaying that
# rate (never below `o.clip`) when this call lands on a decay step.
function apply!(o::ExpDecay, x, Δ)
    lr, period, factor, first_step = o.eta, o.step, o.decay, o.start
    # Advance and store the step counter kept for this parameter.
    nsteps = get(o.current, x, 0) + 1
    o.current[x] = nsteps
    # A counter is on a decay step once past `start` and on a multiple of `step`.
    isdue(k) = k > first_step && k % period == 0
    # Decay only while exactly one stored counter is on a decay step, presumably
    # so the rate drops once per step rather than once per parameter (confirm).
    if isdue(nsteps) && count(isdue, values(o.current)) == 1
        lr = max(lr * factor, o.clip)
        o.eta = lr
    end
    @. Δ *= lr
    return Δ
end

"""

WeightDecay(λ = 0)

Decay weights by ``λ``.

Typically composed  with other optimisers as the first transformation to the gradient,

making it equivalent to adding ``L_2`` regularization

with coefficient  ``λ`` to the loss.

# Examples

```julia

```

"""

mutable

struct

WeightDecay

<:

AbstractOptimiser

wd
::
Real

end

WeightDecay
(
)

=

WeightDecay
(
0
)

# Add `o.wd * x` to the gradient `Δ` in place and return it.
function apply!(o::WeightDecay, x, Δ)
    λ = o.wd
    Δ .+= λ .* x
    return Δ
end

"""

ClipValue(thresh)

Clip gradients when their absolute value exceeds `thresh`.

!!! note

This will be replaced by `Optimisers.ClipGrad` in Flux 0.14.

"""

mutable

struct

ClipValue
{
T
}

<:

AbstractOptimiser

thresh
::
T

end

apply!
(

o
::

ClipValue
,

x
,

Δ
)

=

clamp!
(
Δ
,

-

o
.

thresh
,

o
.

thresh
)

"""

ClipNorm(thresh)

Clip gradients when their L2 norm exceeds `thresh`.

"""

mutable

struct

ClipNorm
{
T
}

<:

AbstractOptimiser

thresh
::
T

end

function

apply!
(

o
::

ClipNorm
,

x
,

Δ
)

Δnrm

=

norm
(
Δ
)

if

Δnrm

>

o
.

thresh

rmul!
(
Δ
,

o
.

thresh

/

Δnrm
)

end

return

Δ

end``````