using Flux
using MacroTools: @forward
using LinearAlgebra: norm, rmul!   # used by ClipNorm below

abstract type AbstractOptimiser end

const EPS = 1e-8

# TODO: should use weak refs
"""
Descent(η = 0.1)
Classic gradient descent optimiser with learning rate `η`.
For each parameter `p` and its gradient `δp`, this runs `p -= η*δp`
# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
the weights.
# Examples
```julia
opt = Descent()
opt = Descent(0.3)
ps = Flux.params(model)
gs = gradient(ps) do
loss(x, y)
end
Flux.Optimise.update!(opt, ps, gs)
```
"""
mutable
struct
Descent
<:
AbstractOptimiser
eta
::
Float64
end
Descent
(
)
=
Descent
(
0.1
)
function
apply!
(
o
::
Descent
,
x
,
Δ
)
Δ
.*=
o
.
eta
end
"""
Momentum(η = 0.01, ρ = 0.9)
Gradient descent optimiser with learning rate `η` and momentum `ρ`.
# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
the weights.
- Momentum (`ρ`): Controls the acceleration of gradient descent in the
prominent direction, in effect damping oscillations.
# Examples
```julia
opt = Momentum()
opt = Momentum(0.01, 0.99)
```
"""
mutable
struct
Momentum
<:
AbstractOptimiser
eta
::
Float64
rho
::
Float64
velocity
::
IdDict
end
Momentum
(
η
=
0.01
,
ρ
=
0.9
)
=
Momentum
(
η
,
ρ
,
IdDict
(
)
)
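
# v is an exponentially decayed accumulation of past descent steps (−ηΔ);
# returning −v makes `update!`, which subtracts the result, step along v.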
function apply!(o::Momentum, x, Δ)
  η, ρ = o.eta, o.rho
  v = get!(() -> zero(x), o.velocity, x)::typeof(x)
  @. v = ρ * v - η * Δ
  @. Δ = -v
end

"""
Nesterov(η = 0.001, ρ = 0.9)
Gradient descent optimiser with learning rate `η` and Nesterov momentum `ρ`.
# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
the weights.
- Nesterov momentum (`ρ`): Controls the acceleration of gradient descent in the
prominent direction, in effect damping oscillations.
# Examples
```julia
opt = Nesterov()
opt = Nesterov(0.003, 0.95)
```
"""
mutable
struct
Nesterov
<:
AbstractOptimiser
eta
::
Float64
rho
::
Float64
velocity
::
IdDict
end
Nesterov
(
η
=
0.001
,
ρ
=
0.9
)
=
Nesterov
(
η
,
ρ
,
IdDict
(
)
)
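
# Nesterov look-ahead: with v_new = ρv − ηΔ the applied step is ρ·v_new − ηΔ,
# which expands to d = ρ²v − (1 + ρ)ηΔ below.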
function apply!(o::Nesterov, x, Δ)
  η, ρ = o.eta, o.rho
  v = get!(() -> zero(x), o.velocity, x)::typeof(x)
  d = @. ρ^2 * v - (1 + ρ) * η * Δ
  @. v = ρ * v - η * Δ
  @. Δ = -d
end

"""
RMSProp(η = 0.001, ρ = 0.9, ϵ =
$
EPS
)
Optimizer using the
[RMSProp](https://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf)
algorithm. Often a good choice for recurrent networks. Parameters other than learning rate
generally don't need tuning.
# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
the weights.
- Momentum (`ρ`): Controls the acceleration of gradient descent in the
prominent direction, in effect damping oscillations.
# Examples
```julia
opt = RMSProp()
opt = RMSProp(0.002, 0.95)
```
"""
mutable
struct
RMSProp
<:
AbstractOptimiser
eta
::
Float64
rho
::
Float64
epsilon
::
Float64
acc
::
IdDict
end
RMSProp
(
η
::
Real
=
0.001
,
ρ
::
Real
=
0.9
,
ϵ
::
Real
=
EPS
)
=
RMSProp
(
η
,
ρ
,
ϵ
,
IdDict
(
)
)
RMSProp
(
η
::
Real
,
ρ
::
Real
,
acc
::
IdDict
)
=
RMSProp
(
η
,
ρ
,
EPS
,
acc
)
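
# acc keeps a leaky average of the squared gradient magnitude (conj makes this
# correct for complex parameters); each coordinate is scaled by η / (√acc + ϵ).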
function apply!(o::RMSProp, x, Δ)
  η, ρ = o.eta, o.rho
  acc = get!(() -> zero(x), o.acc, x)::typeof(x)
  @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ)
  @. Δ *= η / (√acc + o.epsilon)
end

"""
Adam(η = 0.001, β::Tuple = (0.9, 0.999), ϵ =
$
EPS
)
[Adam](https://arxiv.org/abs/1412.6980) optimiser.
# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
the weights.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
second (β2) momentum estimate.
# Examples
```julia
opt = Adam()
opt = Adam(0.001, (0.9, 0.8))
```
"""
mutable
struct
Adam
<:
AbstractOptimiser
eta
::
Float64
beta
::
Tuple
{
Float64
,
Float64
}
epsilon
::
Float64
state
::
IdDict
{
Any
,
Any
}
end
Adam
(
η
::
Real
=
0.001
,
β
::
Tuple
=
(
0.9
,
0.999
)
,
ϵ
::
Real
=
EPS
)
=
Adam
(
η
,
β
,
ϵ
,
IdDict
(
)
)
Adam
(
η
::
Real
,
β
::
Tuple
,
state
::
IdDict
)
=
Adam
(
η
,
β
,
EPS
,
state
)
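
# mt and vt are biased estimates of the first and second gradient moments; βp
# carries the running products β₁ᵗ and β₂ᵗ used to correct that bias, so the
# step is η · m̂t / (√v̂t + ϵ).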
function apply!(o::Adam, x, Δ)
  η, β = o.eta, o.beta

  mt, vt, βp = get!(o.state, x) do
      (zero(x), zero(x), Float64[β[1], β[2]])
  end :: Tuple{typeof(x),typeof(x),Vector{Float64}}

  @. mt = β[1] * mt + (1 - β[1]) * Δ
  @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ)
  @. Δ = mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + o.epsilon) * η
  βp .= βp .* β

  return Δ
end

"""
RAdam(η = 0.001, β::Tuple = (0.9, 0.999), ϵ =
$
EPS
)
[Rectified Adam](https://arxiv.org/abs/1908.03265) optimiser.
# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
the weights.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
second (β2) momentum estimate.
# Examples
```julia
opt = RAdam()
opt = RAdam(0.001, (0.9, 0.8))
```
"""
mutable
struct
RAdam
<:
AbstractOptimiser
eta
::
Float64
beta
::
Tuple
{
Float64
,
Float64
}
epsilon
::
Float64
state
::
IdDict
{
Any
,
Any
}
end
RAdam
(
η
::
Real
=
0.001
,
β
::
Tuple
=
(
0.9
,
0.999
)
,
ϵ
::
Real
=
EPS
)
=
RAdam
(
η
,
β
,
ϵ
,
IdDict
(
)
)
RAdam
(
η
::
Real
,
β
::
Tuple
,
state
::
IdDict
)
=
RAdam
(
η
,
β
,
EPS
,
state
)
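
# Following the RAdam paper, ρ estimates the length of the approximated SMA of
# the adaptive learning rate; while ρ ≤ 4 the variance rectification term r is
# undefined, so the update falls back to bias-corrected momentum only.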
function apply!(o::RAdam, x, Δ)
  η, β = o.eta, o.beta
  ρ∞ = 2/(1 - β[2]) - 1

  mt, vt, βp, t = get!(o.state, x) do
      (zero(x), zero(x), Float64[β[1], β[2]], Ref(1))
  end :: Tuple{typeof(x),typeof(x),Vector{Float64},Base.RefValue{Int}}

  @. mt = β[1] * mt + (1 - β[1]) * Δ
  @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ)
  ρ = ρ∞ - 2t[] * βp[2] / (1 - βp[2])
  if ρ > 4
    r = sqrt((ρ - 4) * (ρ - 2) * ρ∞ / ((ρ∞ - 4) * (ρ∞ - 2) * ρ))
    @. Δ = mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + o.epsilon) * η * r
  else
    @. Δ = mt / (1 - βp[1]) * η
  end
  βp .= βp .* β
  t[] += 1

  return Δ
end

"""
AdaMax(η = 0.001, β::Tuple = (0.9, 0.999), ϵ =
$
EPS
)
[AdaMax](https://arxiv.org/abs/1412.6980) is a variant of Adam based on the ∞-norm.
# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
the weights.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
second (β2) momentum estimate.
# Examples
```julia
opt = AdaMax()
opt = AdaMax(0.001, (0.9, 0.995))
```
"""
mutable
struct
AdaMax
<:
AbstractOptimiser
eta
::
Float64
beta
::
Tuple
{
Float64
,
Float64
}
epsilon
::
Float64
state
::
IdDict
{
Any
,
Any
}
end
AdaMax
(
η
::
Real
=
0.001
,
β
::
Tuple
=
(
0.9
,
0.999
)
,
ϵ
::
Real
=
EPS
)
=
AdaMax
(
η
,
β
,
ϵ
,
IdDict
(
)
)
AdaMax
(
η
::
Real
,
β
::
Tuple
,
state
::
IdDict
)
=
AdaMax
(
η
,
β
,
EPS
,
state
)
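
# ut replaces Adam's second moment with a running ∞-norm: an exponentially
# decayed maximum of past absolute gradients.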
function apply!(o::AdaMax, x, Δ)
  η, β = o.eta, o.beta

  mt, ut, βp = get!(o.state, x) do
      (zero(x), zero(x), Float64[β[1], β[2]])
  end :: Tuple{typeof(x),typeof(x),Vector{Float64}}

  @. mt = β[1] * mt + (1 - β[1]) * Δ
  @. ut = max(β[2] * ut, abs(Δ))
  @. Δ = (η / (1 - βp[1])) * mt / (ut + o.epsilon)
  βp .= βp .* β

  return Δ
end

"""
OAdam(η = 0.0001, β::Tuple = (0.5, 0.9), ϵ =
$
EPS
)
[OAdam](https://arxiv.org/abs/1711.00141) (Optimistic Adam)
is a variant of Adam adding an "optimistic" term suitable for adversarial training.
# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
the weights.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
second (β2) momentum estimate.
# Examples
```julia
opt = OAdam()
opt = OAdam(0.001, (0.9, 0.995))
```
"""
mutable
struct
OAdam
<:
AbstractOptimiser
eta
::
Float64
beta
::
Tuple
{
Float64
,
Float64
}
epsilon
::
Float64
state
::
IdDict
{
Any
,
Any
}
end
OAdam
(
η
::
Real
=
0.001
,
β
::
Tuple
=
(
0.5
,
0.9
)
,
ϵ
::
Real
=
EPS
)
=
OAdam
(
η
,
β
,
ϵ
,
IdDict
(
)
)
OAdam
(
η
::
Real
,
β
::
Tuple
,
state
::
IdDict
)
=
RMSProp
(
η
,
β
,
EPS
,
state
)
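
# Δ_ stores the previous Adam step; the optimistic update returns 2·Δ_new − Δ_old,
# playing the new step forward while retracting the old one.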
function apply!(o::OAdam, x, Δ)
  η, β = o.eta, o.beta

  mt, vt, Δ_, βp = get!(o.state, x) do
      (zero(x), zero(x), zero(x), Float64[β[1], β[2]])
  end :: Tuple{typeof(x),typeof(x),typeof(x),Vector{Float64}}

  @. mt = β[1] * mt + (1 - β[1]) * Δ
  @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ)
  @. Δ = -Δ_
  @. Δ_ = η * mt / (1 - βp[1]) / (√(vt / (1 - βp[2])) + o.epsilon)
  @. Δ += 2Δ_
  βp .= βp .* β

  return Δ
end

"""
AdaGrad(η = 0.1, ϵ =
$
EPS
)
[AdaGrad](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) optimiser. It has
parameter specific learning rates based on how frequently it is updated.
Parameters don't need tuning.
# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
the weights.
# Examples
```julia
opt = AdaGrad()
opt = AdaGrad(0.001)
```
"""
mutable
struct
AdaGrad
<:
AbstractOptimiser
eta
::
Float64
epsilon
::
Float64
acc
::
IdDict
end
AdaGrad
(
η
::
Real
=
0.1
,
ϵ
::
Real
=
EPS
)
=
AdaGrad
(
η
,
ϵ
,
IdDict
(
)
)
AdaGrad
(
η
::
Real
,
state
::
IdDict
)
=
AdaGrad
(
η
,
EPS
,
state
)
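
# acc accumulates the full sum of squared gradients over the run, so each
# coordinate's effective learning rate η/√acc only ever shrinks.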
function apply!(o::AdaGrad, x, Δ)
  η = o.eta
  acc = get!(() -> fill!(similar(x), o.epsilon), o.acc, x)::typeof(x)
  @. acc += Δ * conj(Δ)
  @. Δ *= η / (√acc + o.epsilon)
end

"""
AdaDelta(ρ = 0.9, ϵ =
$
EPS
)
[AdaDelta](https://arxiv.org/abs/1212.5701) is a version of AdaGrad adapting its learning
rate based on a window of past gradient updates.
Parameters don't need tuning.
# Parameters
- Rho (`ρ`): Factor by which the gradient is decayed at each time step.
# Examples
```julia
opt = AdaDelta()
opt = AdaDelta(0.89)
```
"""
mutable
struct
AdaDelta
<:
AbstractOptimiser
rho
::
Float64
epsilon
::
Float64
state
::
IdDict
{
Any
,
Any
}
end
AdaDelta
(
ρ
::
Real
=
0.9
,
ϵ
::
Real
=
EPS
)
=
AdaDelta
(
ρ
,
ϵ
,
IdDict
(
)
)
AdaDelta
(
ρ
::
Real
,
state
::
IdDict
)
=
AdaDelta
(
ρ
,
EPS
,
state
)
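
# AdaDelta has no global learning rate: the step rescales the gradient by the
# ratio RMS(past updates Δacc) / RMS(past gradients acc).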
function apply!(o::AdaDelta, x, Δ)
  ρ = o.rho
  acc, Δacc = get!(() -> (zero(x), zero(x)), o.state, x)::NTuple{2,typeof(x)}
  @. acc = ρ * acc + (1 - ρ) * Δ * conj(Δ)
  # DON'T remove epsilon from numerator
  # or even out of the square roots
  @. Δ *= √(Δacc + o.epsilon) / √(acc + o.epsilon)
  @. Δacc = ρ * Δacc + (1 - ρ) * Δ * conj(Δ)
  return Δ
end

"""
AMSGrad(η = 0.001, β::Tuple = (0.9, 0.999), ϵ =
$
EPS
)
The [AMSGrad](https://openreview.net/forum?id=ryQu7f-RZ) version of the Adam
optimiser. Parameters don't need tuning.
# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
the weights.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
second (β2) momentum estimate.
# Examples
```julia
opt = AMSGrad()
opt = AMSGrad(0.001, (0.89, 0.995))
```
"""
mutable
struct
AMSGrad
<:
AbstractOptimiser
eta
::
Float64
beta
::
Tuple
{
Float64
,
Float64
}
epsilon
::
Float64
state
::
IdDict
{
Any
,
Any
}
end
AMSGrad
(
η
::
Real
=
0.001
,
β
=
(
0.9
,
0.999
)
,
ϵ
::
Real
=
EPS
)
=
AMSGrad
(
η
,
β
,
ϵ
,
IdDict
(
)
)
AMSGrad
(
η
::
Real
,
β
::
Tuple
,
state
::
IdDict
)
=
AMSGrad
(
η
,
β
,
EPS
,
state
)
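
# v̂t is the running maximum of all second-moment estimates, which keeps the
# per-coordinate effective learning rate from increasing between steps.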
function apply!(o::AMSGrad, x, Δ)
  η, β = o.eta, o.beta

  mt, vt, v̂t = get!(o.state, x) do
      (fill!(similar(x), o.epsilon), fill!(similar(x), o.epsilon), fill!(similar(x), o.epsilon))
  end :: NTuple{3,typeof(x)}

  @. mt = β[1] * mt + (1 - β[1]) * Δ
  @. vt = β[2] * vt + (1 - β[2]) * Δ ^ 2
  @. v̂t = max(v̂t, vt)
  @. Δ = η * mt / (√v̂t + o.epsilon)
end

"""
NAdam(η = 0.001, β::Tuple = (0.9, 0.999), ϵ =
$
EPS
)
[NAdam](https://openreview.net/forum?id=OM0jvwB8jIp57ZJjtNEZ) is a Nesterov variant of Adam.
Parameters don't need tuning.
# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
the weights.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
second (β2) momentum estimate.
# Examples
```julia
opt = NAdam()
opt = NAdam(0.002, (0.89, 0.995))
```
"""
mutable
struct
NAdam
<:
AbstractOptimiser
eta
::
Float64
beta
::
Tuple
{
Float64
,
Float64
}
epsilon
::
Float64
state
::
IdDict
{
Any
,
Any
}
end
NAdam
(
η
::
Real
=
0.001
,
β
=
(
0.9
,
0.999
)
,
ϵ
::
Real
=
EPS
)
=
NAdam
(
η
,
β
,
ϵ
,
IdDict
(
)
)
NAdam
(
η
::
Real
,
β
::
Tuple
,
state
::
IdDict
)
=
NAdam
(
η
,
β
,
EPS
,
state
)
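
# Nesterov-style look-ahead on the momentum term: the numerator mixes the
# bias-corrected momentum with the bias-corrected current gradient before the
# √vt rescaling.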
function apply!(o::NAdam, x, Δ)
  η, β = o.eta, o.beta

  mt, vt, βp = get!(o.state, x) do
      (zero(x), zero(x), Float64[o.beta[1], o.beta[2]])
  end :: Tuple{typeof(x),typeof(x),Vector{Float64}}
  β1p, β2p = βp

  @. mt = β[1] * mt + (1 - β[1]) * Δ
  @. vt = β[2] * vt + (1 - β[2]) * Δ * conj(Δ)
  @. Δ = (β[1] * mt / (1 - β[1] * β1p) + (1 - β[1]) * Δ / (1 - β1p)) / (√(vt * β[2] / (1 - β2p)) + o.epsilon) * η
  βp .= βp .* β

  return Δ
end

"""
AdamW(η = 0.001, β::Tuple = (0.9, 0.999), decay = 0)
[AdamW](https://arxiv.org/abs/1711.05101) is a variant of Adam fixing (as in repairing) its
weight decay regularization.
# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
the weights.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
second (β2) momentum estimate.
- `decay`: Decay applied to weights during optimisation.
# Examples
```julia
opt = AdamW()
opt = AdamW(0.001, (0.89, 0.995), 0.1)
```
"""
AdamW(η = 0.001, β = (0.9, 0.999), decay = 0) =
  Optimiser(Adam(η, β), WeightDecay(decay))

"""
AdaBelief(η = 0.001, β::Tuple = (0.9, 0.999), ϵ =
$
EPS
)
The [AdaBelief](https://arxiv.org/abs/2010.07468) optimiser is a variant of the well-known
Adam optimiser.
# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
the weights.
- Decay of momentums (`β::Tuple`): Exponential decay for the first (β1) and the
second (β2) momentum estimate.
# Examples
```julia
opt = AdaBelief()
opt = AdaBelief(0.001, (0.9, 0.8))
```
"""
mutable
struct
AdaBelief
<:
AbstractOptimiser
eta
::
Float64
beta
::
Tuple
{
Float64
,
Float64
}
epsilon
::
Float64
state
::
IdDict
{
Any
,
Any
}
end
AdaBelief
(
η
::
Real
=
0.001
,
β
=
(
0.9
,
0.999
)
,
ϵ
::
Real
=
EPS
)
=
AdaBelief
(
η
,
β
,
ϵ
,
IdDict
(
)
)
AdaBelief
(
η
::
Real
,
β
::
Tuple
,
state
::
IdDict
)
=
AdaBelief
(
η
,
β
,
EPS
,
state
)
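
# st tracks the "belief": an exponential moving variance of the gradient around
# its first-moment estimate mt, used in place of Adam's raw second moment.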
function apply!(o::AdaBelief, x, Δ)
  η, β = o.eta, o.beta

  mt, st, βp = get!(o.state, x) do
      (zero(x), zero(x), Float64[β[1], β[2]])
  end :: Tuple{typeof(x),typeof(x),Vector{Float64}}

  #= st is a variance and can go to zero. This is in contrast to Adam, which uses the
  second moment which is usually far enough from zero. This is problematic, since st
  can be slightly negative due to numerical error, and the square root below will fail.
  Also, if we want to differentiate through the optimiser, √0 is not differentiable.
  To protect against this, we add a small number, st -> st + eps2.
  The original implementation (https://github.com/juntang-zhuang/Adabelief-Optimizer)
  uses the square of Adam's epsilon, which we do here.
  See also: https://github.com/juntang-zhuang/Adabelief-Optimizer/issues/61 =#
  eps2 = o.epsilon^2   # TODO: make epsilon^2 the default in next breaking release

  @. mt = β[1] * mt + (1 - β[1]) * Δ
  @. st = β[2] * st + (1 - β[2]) * (Δ - mt) * conj(Δ - mt) + eps2
  @. Δ = η * mt / (1 - βp[1]) / (√(st / (1 - βp[2])) + eps2)
  βp .= βp .* β

  return Δ
end

# Compose optimisers

"""
    Optimiser(a, b, c...)

Combine several optimisers into one; each optimiser produces a modified gradient
that will be fed into the next, and this is finally applied to the parameter as
usual.

!!! note
    This will be replaced by `Optimisers.OptimiserChain` in Flux 0.14.
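
# Examples
Chaining, for example, gradient-value clipping with an Adam step:
```julia
opt = Optimiser(ClipValue(1e-3), Adam(1e-3))
```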
"""
mutable struct Optimiser <: AbstractOptimiser
  os::Vector{Any}
end

Optimiser(opts::AbstractOptimiser...) = Optimiser(Any[opts...])

@forward Optimiser.os Base.getindex, Base.first, Base.last, Base.lastindex, Base.push!, Base.setindex!
@forward Optimiser.os Base.iterate

Base.getindex(c::Optimiser, i::AbstractArray) = Optimiser(c.os[i]...)

function apply!(o::Optimiser, x, Δ)
  for opt in o.os
    Δ = apply!(opt, x, Δ)
  end
  return Δ
end

"""
InvDecay(γ = 0.001)
Apply inverse time decay to an optimiser, so that the effective step size at
iteration `n` is `eta / (1 + γ * n)` where `eta` is the initial step size.
The wrapped optimiser's step size is not modified.
See also the [Scheduling Optimisers](@ref) section of the docs
for more general scheduling techniques.
# Examples
`InvDecay` is typically composed with other optimisers
as the last transformation of the gradient:
```julia
# Inverse decay of the learning rate
# with starting value 0.001 and decay coefficient 0.01.
opt = Optimiser(Adam(1f-3), InvDecay(1f-2))
```
"""
mutable
struct
InvDecay
<:
AbstractOptimiser
gamma
::
Float64
state
::
IdDict
{
Any
,
Int
}
end
InvDecay
(
γ
=
0.001
)
=
InvDecay
(
γ
,
IdDict
{
Any
,
Int
}
(
)
)
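
# Each parameter keeps its own step counter n, so parameters added later start
# with an undecayed step size.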
function apply!(o::InvDecay, x, Δ)
  γ = o.gamma
  n = get!(o.state, x, 1)
  Δ .*= 1 / (1 + γ * n)
  o.state[x] = n + 1
  return Δ
end

"""
ExpDecay(η = 0.001, decay = 0.1, decay_step = 1000, clip = 1e-4, start = 1)
Discount the learning rate `η` by the factor `decay` every `decay_step` steps till
a minimum of `clip`.
# Parameters
- Learning rate (`η`): Amount by which gradients are discounted before updating
the weights.
- `decay`: Factor by which the learning rate is discounted.
- `decay_step`: Schedule decay operations by setting the number of steps between
two decay operations.
- `clip`: Minimum value of learning rate.
- 'start': Step at which the decay starts.
See also the [Scheduling Optimisers](@ref) section of the docs
for more general scheduling techniques.
# Examples
`ExpDecay` is typically composed with other optimisers
as the last transformation of the gradient:
```julia
opt = Optimiser(Adam(), ExpDecay(1.0))
```
Note: you may want to start with `η=1` in `ExpDecay` when combined with other
optimisers (`Adam` in this case) that have their own learning rate.
"""
mutable
struct
ExpDecay
<:
AbstractOptimiser
eta
::
Float64
decay
::
Float64
step
::
Int64
clip
::
Float64
start
::
Int64
current
::
IdDict
end
ExpDecay
(
opt
=
0.001
,
decay
=
0.1
,
decay_step
=
1000
,
clip
=
1e-4
,
start
=
0
)
=
ExpDecay
(
opt
,
decay
,
decay_step
,
clip
,
start
,
IdDict
(
)
)
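
# apply! is called once per parameter per step, so o.current holds one counter
# per parameter. The count(...) == 1 guard ensures only the first parameter to
# cross a decay boundary actually decays the shared η.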
function apply!(o::ExpDecay, x, Δ)
  η, s, decay, start = o.eta, o.step, o.decay, o.start
  n = o.current[x] = get(o.current, x, 0) + 1
  if n > start && n % s == 0 && count(x -> x > start && x % s == 0, values(o.current)) == 1
    η = max(η * decay, o.clip)
    o.eta = η
  end
  @. Δ *= η
end

"""
WeightDecay(λ = 0)
Decay weights by ``λ``.
Typically composed with other optimisers as the first transformation to the gradient,
making it equivalent to adding ``L_2`` regularization
with coefficient ``λ`` to the loss.
# Examples
```julia
opt = Optimiser(WeightDecay(1f-4), Adam())
```
"""
mutable
struct
WeightDecay
<:
AbstractOptimiser
wd
::
Real
end
WeightDecay
(
)
=
WeightDecay
(
0
)
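
# Adding wd * x to the gradient means `update!` (which subtracts) shrinks each
# weight toward zero, matching the gradient of an L2 penalty (wd/2)‖x‖².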
function apply!(o::WeightDecay, x, Δ)
  wd = o.wd
  @. Δ += wd * x
end

"""
ClipValue(thresh)
Clip gradients when their absolute value exceeds `thresh`.
!!! note
This will be replaced by `Optimisers.ClipGrad` in Flux 0.14.
"""
mutable
struct
ClipValue
{
T
}
<:
AbstractOptimiser
thresh
::
T
end
apply!
(
o
::
ClipValue
,
x
,
Δ
)
=
clamp!
(
Δ
,
-
o
.
thresh
,
o
.
thresh
)
"""
ClipNorm(thresh)
Clip gradients when their L2 norm exceeds `thresh`.
"""
mutable
struct
ClipNorm
{
T
}
<:
AbstractOptimiser
thresh
::
T
end
function
apply!
(
o
::
ClipNorm
,
x
,
Δ
)
Δnrm
=
norm
(
Δ
)
if
Δnrm
>
o
.
thresh
rmul!
(
Δ
,
o
.
thresh
/
Δnrm
)
end
return
Δ
end