# functions.jl

`Flux/losses/functions.jl` is a source file in the `Flux` module.

In this file, doctests which differ only in the printed Float32 values won't fail, thanks to the `DocTestFilters` setting below.

``````
```@meta
DocTestFilters = r"[0-9\.]+f0"
```

"""

mae(ŷ, y; agg = mean)

Return the loss corresponding to mean absolute error:

agg(abs.(ŷ .- y))

# Example

```jldoctest

julia> y_model = [1.1, 1.9, 3.1];

julia> Flux.mae(y_model, 1:3)

0.10000000000000009

```

"""

function mae(ŷ, y; agg = mean)
    _check_sizes(ŷ, y)
    agg(abs.(ŷ .- y))
end

"""

mse(ŷ, y; agg = mean)

Return the loss corresponding to mean square error:

agg((ŷ .- y) .^ 2)

# Example

```jldoctest

julia> y_model = [1.1, 1.9, 3.1];

julia> y_true = 1:3;

julia> Flux.mse(y_model, y_true)

0.010000000000000018

```

"""

function mse(ŷ, y; agg = mean)
    _check_sizes(ŷ, y)
    agg(abs2.(ŷ .- y))
end

"""

msle(ŷ, y; agg = mean, eps = eps(eltype(ŷ)))

The loss corresponding to mean squared logarithmic errors, calculated as

agg((log.(ŷ .+ ϵ) .- log.(y .+ ϵ)) .^ 2)

The `ϵ == eps` term provides numerical stability.

Penalizes an under-estimation more than an over-estimatation.

# Example

```jldoctest

julia> Flux.msle(Float32[1.1, 2.2, 3.3], 1:3)

0.009084041f0

julia> Flux.msle(Float32[0.9, 1.8, 2.7], 1:3)

0.011100831f0

```

"""

function msle(ŷ, y; agg = mean, eps::Real = epseltype(ŷ), ϵ = nothing)
    ϵ = _greek_ascii_depwarn(ϵ => eps, :msle, "ϵ" => "eps")
    _check_sizes(ŷ, y)
    agg((log.((ŷ .+ ϵ) ./ (y .+ ϵ))) .^ 2)
end

"""

huber_loss(ŷ, y; delta = 1, agg = mean)

Return the mean of the [Huber loss](https://en.wikipedia.org/wiki/Huber_loss)

given the prediction `ŷ` and true values `y`.

| 0.5 * |ŷ - y|^2,            for |ŷ - y| <= δ

Huber loss = |

|  δ * (|ŷ - y| - 0.5 * δ), otherwise

# Example

```jldoctest

julia> ŷ = [1.1, 2.1, 3.1];

julia> Flux.huber_loss(ŷ, 1:3)  # default δ = 1 > |ŷ - y|

0.005000000000000009

julia> Flux.huber_loss(ŷ, 1:3, delta=0.05)  # changes behaviour as |ŷ - y| > δ

0.003750000000000005

```

"""

function huber_loss(ŷ, y; agg = mean, delta::Real = 1, δ = nothing)
    delta_tmp = _greek_ascii_depwarn(δ => delta, :huber_loss, "δ" => "delta")
    δ = ofeltype(ŷ, delta_tmp)
    _check_sizes(ŷ, y)
    abs_error = abs.(ŷ .- y)
    #TODO: remove ignore_derivatives when Zygote can handle this function with CuArrays
    temp = Zygote.ignore_derivatives(abs_error .< δ)
    x = ofeltype(ŷ, 0.5)
    agg(((abs_error .^ 2) .* temp) .* x .+ δ * (abs_error .- x * δ) .* (1 .- temp))
end
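
# When every |ŷ - y| <= δ only the quadratic branch is active, so the Huber loss
# reduces to half the mean squared error. A minimal sketch, reusing the values
# from the docstring example above:
#
#     julia> ŷ = [1.1, 2.1, 3.1];
#
#     julia> Flux.huber_loss(ŷ, 1:3) ≈ 0.5 * Flux.mse(ŷ, 1:3)
#     true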

"""

label_smoothing(y::Union{Number, AbstractArray}, α; dims::Int=1)

Returns smoothed labels, meaning the confidence on label values are relaxed.

When `y` is given as one-hot vector or batch of one-hot, its calculated as

y .* (1 - α) .+ α / size(y, dims)

when `y` is given as a number or batch of numbers for binary classification,

its calculated as

y .* (1 - α) .+ α / 2

in which case the labels are squeezed towards `0.5`.

α is a number in interval (0, 1) called the smoothing factor. Higher the

value of α larger the smoothing of `y`.

`dims` denotes the one-hot dimension, unless `dims=0` which denotes the application

of label smoothing to binary distributions encoded in a single number.

# Example

```jldoctest

julia> y = Flux.onehotbatch([1, 1, 1, 0, 1, 0], 0:1)

2×6 OneHotMatrix(::Vector{UInt32}) with eltype Bool:

⋅  ⋅  ⋅  1  ⋅  1

1  1  1  ⋅  1  ⋅

julia> y_smoothed = Flux.label_smoothing(y, 0.2f0)

2×6 Matrix{Float32}:

0.1  0.1  0.1  0.9  0.1  0.9

0.9  0.9  0.9  0.1  0.9  0.1

julia> y_sim = softmax(y .* log(2f0))

2×6 Matrix{Float32}:

0.333333  0.333333  0.333333  0.666667  0.333333  0.666667

0.666667  0.666667  0.666667  0.333333  0.666667  0.333333

julia> y_dis = vcat(y_sim[2,:]', y_sim[1,:]')

2×6 Matrix{Float32}:

0.666667  0.666667  0.666667  0.333333  0.666667  0.333333

0.333333  0.333333  0.333333  0.666667  0.333333  0.666667

julia> Flux.crossentropy(y_sim, y) < Flux.crossentropy(y_sim, y_smoothed)

true

julia> Flux.crossentropy(y_dis, y) > Flux.crossentropy(y_dis, y_smoothed)

true

```

"""

function label_smoothing(y::Union{AbstractArray,Number}, α::Number; dims::Int = 1)
    if !(0 < α < 1)
        throw(ArgumentError("α must be between 0 and 1"))
    end
    if dims == 0
        y_smoothed = y .* (1 - α) .+ α * 1 // 2
    elseif dims == 1
        y_smoothed = y .* (1 - α) .+ α * 1 // size(y, 1)
    else
        throw(ArgumentError("`dims` should be either 0 or 1"))
    end
    return y_smoothed
end
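
# With `dims = 0` the same smoothing is applied to plain binary labels, squeezing
# them towards 0.5. A minimal sketch, reusing α = 0.2 from the docstring example:
#
#     julia> Flux.label_smoothing(Float32[0, 1], 0.2f0; dims = 0) ≈ [0.1, 0.9]
#     true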

"""

crossentropy(ŷ, y; dims = 1, eps = eps(eltype(ŷ)), agg = mean)

Return the cross entropy between the given probability distributions;

calculated as

agg(-sum(y .* log.(ŷ .+ ϵ); dims))

Cross entropy is typically used as a loss in multi-class classification,

in which case the labels `y` are given in a one-hot format.

`dims` specifies the dimension (or the dimensions) containing the class probabilities.

The prediction `ŷ` is supposed to sum to one across `dims`,

as would be the case with the output of a [softmax](@ref Softmax) operation.

For numerical stability, it is recommended to use [`logitcrossentropy`](@ref)

rather than `softmax` followed by `crossentropy` .

Use [`label_smoothing`](@ref) to smooth the true labels as preprocessing before

computing the loss.

# Example

```jldoctest

julia> y_label = Flux.onehotbatch([0, 1, 2, 1, 0], 0:2)

3×5 OneHotMatrix(::Vector{UInt32}) with eltype Bool:

1  ⋅  ⋅  ⋅  1

⋅  1  ⋅  1  ⋅

⋅  ⋅  1  ⋅  ⋅

julia> y_model = softmax(reshape(-7:7, 3, 5) .* 1f0)

3×5 Matrix{Float32}:

0.0900306  0.0900306  0.0900306  0.0900306  0.0900306

0.244728   0.244728   0.244728   0.244728   0.244728

0.665241   0.665241   0.665241   0.665241   0.665241

julia> sum(y_model; dims=1)

1×5 Matrix{Float32}:

1.0  1.0  1.0  1.0  1.0

julia> Flux.crossentropy(y_model, y_label)

1.6076053f0

julia> 5 * ans ≈ Flux.crossentropy(y_model, y_label; agg=sum)

true

julia> y_smooth = Flux.label_smoothing(y_label, 0.15f0)

3×5 Matrix{Float32}:

0.9   0.05  0.05  0.05  0.9

0.05  0.9   0.05  0.9   0.05

0.05  0.05  0.9   0.05  0.05

julia> Flux.crossentropy(y_model, y_smooth)

1.5776052f0

```

"""

function crossentropy(ŷ, y; dims = 1, agg = mean, eps::Real = epseltype(ŷ), ϵ = nothing)
    ϵ = _greek_ascii_depwarn(ϵ => eps, :crossentropy, "ϵ" => "eps")
    _check_sizes(ŷ, y)
    agg(.-sum(xlogy.(y, ŷ .+ ϵ); dims = dims))
end

"""

logitcrossentropy(ŷ, y; dims = 1, agg = mean)

Return the cross entropy calculated by

agg(-sum(y .* logsoftmax(ŷ; dims); dims))

This is mathematically equivalent to `crossentropy(softmax(ŷ), y)`,

but is more numerically stable than using functions [`crossentropy`](@ref)

and [softmax](@ref Softmax) separately.

# Example

```jldoctest

julia> y_label = Flux.onehotbatch(collect("abcabaa"), 'a':'c')

3×7 OneHotMatrix(::Vector{UInt32}) with eltype Bool:

1  ⋅  ⋅  1  ⋅  1  1

⋅  1  ⋅  ⋅  1  ⋅  ⋅

⋅  ⋅  1  ⋅  ⋅  ⋅  ⋅

julia> y_model = reshape(vcat(-9:0, 0:9, 7.5f0), 3, 7)

3×7 Matrix{Float32}:

-9.0  -6.0  -3.0  0.0  2.0  5.0  8.0

-8.0  -5.0  -2.0  0.0  3.0  6.0  9.0

-7.0  -4.0  -1.0  1.0  4.0  7.0  7.5

julia> Flux.logitcrossentropy(y_model, y_label)

1.5791205f0

julia> Flux.crossentropy(softmax(y_model), y_label)

1.5791197f0

```

"""

function logitcrossentropy(ŷ, y; dims = 1, agg = mean)
    _check_sizes(ŷ, y)
    agg(.-sum(y .* logsoftmax(ŷ; dims = dims); dims = dims))
end

"""

binarycrossentropy(ŷ, y; agg = mean, eps = eps(eltype(ŷ)))

Return the binary cross-entropy loss, computed as

agg(@.(-y * log(ŷ + ϵ) - (1 - y) * log(1 - ŷ + ϵ)))

Where typically, the prediction `ŷ` is given by the output of a [sigmoid](@ref man-activations) activation.

The `ϵ == eps` term is included to avoid infinity. Using [`logitbinarycrossentropy`](@ref) is recomended

over `binarycrossentropy` for numerical stability.

Use [`label_smoothing`](@ref) to smooth the `y` value as preprocessing before

computing the loss.

# Examples

```jldoctest

julia> y_bin = Bool[1,0,1]

3-element Vector{Bool}:

1

0

1

julia> y_prob = softmax(reshape(vcat(1:3, 3:5), 2, 3) .* 1f0)

2×3 Matrix{Float32}:

0.268941  0.5  0.268941

0.731059  0.5  0.731059

julia> Flux.binarycrossentropy(y_prob[2,:], y_bin)

0.43989f0

julia> all(p -> 0 < p < 1, y_prob[2,:])  # else DomainError

true

julia> y_hot = Flux.onehotbatch(y_bin, 0:1)

2×3 OneHotMatrix(::Vector{UInt32}) with eltype Bool:

⋅  1  ⋅

1  ⋅  1

julia> Flux.crossentropy(y_prob, y_hot)

0.43989f0

```

"""

function binarycrossentropy(ŷ, y; agg = mean, eps::Real = epseltype(ŷ), ϵ = nothing)
    ϵ = _greek_ascii_depwarn(ϵ => eps, :binarycrossentropy, "ϵ" => "eps")
    _check_sizes(ŷ, y)
    agg(@.(-xlogy(y, ŷ + ϵ) - xlogy(1 - y, 1 - ŷ + ϵ)))
end

"""

logitbinarycrossentropy(ŷ, y; agg = mean)

Mathematically equivalent to

[`binarycrossentropy(σ(ŷ), y)`](@ref) but is more numerically stable.

# Examples

```jldoctest

julia> y_bin = Bool[1,0,1];

julia> y_model = Float32[2, -1, pi]

3-element Vector{Float32}:

2.0

-1.0

3.1415927

julia> Flux.logitbinarycrossentropy(y_model, y_bin)

0.160832f0

julia> Flux.binarycrossentropy(sigmoid.(y_model), y_bin)

0.16083185f0

```

"""

function logitbinarycrossentropy(ŷ, y; agg = mean)
    _check_sizes(ŷ, y)
    agg(@.((1 - y) * ŷ - logσ(ŷ)))
end

"""

kldivergence(ŷ, y; agg = mean, eps = eps(eltype(ŷ)))

Return the

[Kullback-Leibler divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence)

between the given probability distributions.

The KL divergence is a measure of how much one probability distribution is different

from the other. It is always non-negative, and zero only when both the distributions are equal.

# Example

```jldoctest

julia> p1 = [1 0; 0 1]

2×2 Matrix{Int64}:

1  0

0  1

julia> p2 = fill(0.5, 2, 2)

2×2 Matrix{Float64}:

0.5  0.5

0.5  0.5

julia> Flux.kldivergence(p2, p1) ≈ log(2)

true

julia> Flux.kldivergence(p2, p1; agg = sum) ≈ 2log(2)

true

julia> Flux.kldivergence(p2, p2; eps = 0)  # about -2e-16 with the regulator

0.0

julia> Flux.kldivergence(p1, p2; eps = 0)  # about 17.3 with the regulator

Inf

```

"""

function kldivergence(ŷ, y; dims = 1, agg = mean, eps::Real = epseltype(ŷ), ϵ = nothing)
    ϵ = _greek_ascii_depwarn(ϵ => eps, :kldivergence, "ϵ" => "eps")
    _check_sizes(ŷ, y)
    entropy = agg(sum(xlogx.(y); dims = dims))
    cross_entropy = crossentropy(ŷ, y; dims, agg, eps = ϵ)
    return entropy + cross_entropy
end
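
# Here KL(y || ŷ) is computed as the sum of `entropy` (that is, `agg(sum(xlogx.(y); dims))`,
# the negative Shannon entropy of `y`) and the cross entropy. A minimal sketch, reusing the
# matrices from the docstring example (the entropy term is zero for the one-hot `p1`):
#
#     julia> p1 = [1 0; 0 1]; p2 = fill(0.5, 2, 2);
#
#     julia> Flux.kldivergence(p2, p1) ≈ Flux.crossentropy(p2, p1)
#     true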

"""

poisson_loss(ŷ, y; agg = mean)

Return how much the predicted distribution `ŷ` diverges from the expected Poisson

distribution `y`; calculated as -

sum(ŷ .- y .* log.(ŷ)) / size(y, 2)

# Example

```jldoctest

julia> y_model = [1, 3, 3];  # data should only take integral values

julia> Flux.poisson_loss(y_model, 1:3)

0.5023128522198171

```

"""

function poisson_loss(ŷ, y; agg = mean)
    _check_sizes(ŷ, y)
    agg(ŷ .- xlogy.(y, ŷ))
end

"""

hinge_loss(ŷ, y; agg = mean)

Return the [hinge_loss](https://en.wikipedia.org/wiki/Hinge_loss) given the

prediction `ŷ` and true labels `y` (containing 1 or -1); calculated as

sum(max.(0, 1 .- ŷ .* y)) / size(y, 2)

Usually used with classifiers like Support Vector Machines.

# Example

```jldoctest

julia> y_true = [1, -1, 1, 1];

julia> y_pred = [0.1, 0.3, 1, 1.5];

julia> Flux.hinge_loss(y_pred, y_true)

0.55

julia> Flux.hinge_loss(y_pred[1], y_true[1]) != 0  # same sign but |ŷ| < 1

true

julia> Flux.hinge_loss(y_pred[end], y_true[end]) == 0  # same sign but |ŷ| >= 1

true

julia> Flux.hinge_loss(y_pred[2], y_true[2]) != 0 # opposite signs

true

```

"""

function hinge_loss(ŷ, y; agg = mean)
    _check_sizes(ŷ, y)
    agg(max.(0, 1 .- ŷ .* y))
end

"""

squared_hinge_loss(ŷ, y)

Return the squared hinge_loss loss given the prediction `ŷ` and true labels `y`

(containing 1 or -1); calculated as

sum((max.(0, 1 .- ŷ .* y)).^2) / size(y, 2)

Usually used with classifiers like Support Vector Machines.

# Example

```jldoctes

julia> y_true = [1, -1, 1, 1];

julia> y_pred = [0.1, 0.3, 1, 1.5];

julia> Flux.squared_hinge_loss(y_pred, y_true)

0.625

julia> Flux.squared_hinge_loss(y_pred[1], y_true[1]) != 0

true

julia> Flux.squared_hinge_loss(y_pred[end], y_true[end]) == 0

true

julia> Flux.squared_hinge_loss(y_pred[2], y_true[2]) != 0

true

```

"""

function squared_hinge_loss(ŷ, y; agg = mean)
    _check_sizes(ŷ, y)
    agg((max.(0, 1 .- ŷ .* y)) .^ 2)
end

"""

dice_coeff_loss(ŷ, y; smooth = 1)

Return a loss based on the dice coefficient.

Used in the [V-Net](https://arxiv.org/abs/1606.04797) image segmentation

architecture.

The dice coefficient is similar to the F1_score. Loss calculated as:

1 - 2*sum(|ŷ .* y| + smooth) / (sum(ŷ.^2) + sum(y.^2) + smooth)

# Example

```jldoctest

julia> y_pred = [1.1, 2.1, 3.1];

julia> Flux.dice_coeff_loss(y_pred, 1:3)

0.000992391663909964

julia> 1 - Flux.dice_coeff_loss(y_pred, 1:3)  # ~ F1 score for image segmentation

0.99900760833609

```

"""

function dice_coeff_loss(ŷ, y; smooth = 1)
    s = ofeltype(ŷ, smooth)
    _check_sizes(ŷ, y)
    1 - (2 * sum(y .* ŷ) + s) / (sum(y .^ 2) + sum(ŷ .^ 2) + s)
end
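
# A minimal sketch: when `ŷ == y`, 2*sum(y .* ŷ) equals sum(y.^2) + sum(ŷ.^2), so the
# numerator and denominator coincide and the loss is exactly zero:
#
#     julia> Flux.dice_coeff_loss(Float32[0, 1, 1], [0, 1, 1]) == 0
#     true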

"""

tversky_loss(ŷ, y; beta = 0.7)

Return the [Tversky loss](https://arxiv.org/abs/1706.05721).

Used with imbalanced data to give more weight to false negatives.

Larger `β == beta` weigh recall more than precision (by placing more emphasis on false negatives).

Calculated as:

1 - sum(|y .* ŷ| + 1) / (sum(y .* ŷ + (1 - β)*(1 .- y) .* ŷ + β*y .* (1 .- ŷ)) + 1)

"""

function tversky_loss(ŷ, y; beta::Real = 0.7, β = nothing)
    beta_temp = _greek_ascii_depwarn(β => beta, :tversky_loss, "β" => "beta")
    β = ofeltype(ŷ, beta_temp)
    _check_sizes(ŷ, y)
    num = sum(y .* ŷ) + 1
    den = sum(y .* ŷ + β * (1 .- y) .* ŷ + (1 - β) * y .* (1 .- ŷ)) + 1
    1 - num / den
end
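
# A minimal sketch of the behaviour: for a perfect binary prediction the false-positive
# and false-negative terms vanish, so `num == den` and the loss is zero:
#
#     julia> Flux.tversky_loss(Float32[1, 0, 1, 0], [1, 0, 1, 0])
#     0.0f0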

"""

binary_focal_loss(ŷ, y; agg=mean, gamma=2, eps=eps(eltype(ŷ)))

Return the [binary_focal_loss](https://arxiv.org/pdf/1708.02002.pdf)

The input, 'ŷ', is expected to be normalized (i.e. [softmax](@ref Softmax) output).

For `gamma = 0`, the loss is mathematically equivalent to [`Losses.binarycrossentropy`](@ref).

# Example

```jldoctest

julia> y = [0  1  0

1  0  1]

2×3 Matrix{Int64}:

0  1  0

1  0  1

julia> ŷ = [0.268941  0.5  0.268941

0.731059  0.5  0.731059]

2×3 Matrix{Float64}:

0.268941  0.5  0.268941

0.731059  0.5  0.731059

julia> Flux.binary_focal_loss(ŷ, y) ≈ 0.0728675615927385

true

```

"""

function binary_focal_loss(ŷ, y; agg = mean, gamma = 2, eps::Real = epseltype(ŷ),
                           ϵ = nothing, γ = nothing)
    ϵ = _greek_ascii_depwarn(ϵ => eps, :binary_focal_loss, "ϵ" => "eps")
    gamma_temp = _greek_ascii_depwarn(γ => gamma, :binary_focal_loss, "γ" => "gamma")
    γ = gamma_temp isa Integer ? gamma_temp : ofeltype(ŷ, gamma_temp)
    _check_sizes(ŷ, y)
    ŷϵ = ŷ .+ ϵ
    p_t = y .* ŷϵ + (1 .- y) .* (1 .- ŷϵ)
    ce = .-log.(p_t)
    weight = (1 .- p_t) .^ γ
    loss = weight .* ce
    agg(loss)
end
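
# With `gamma = 0` the modulating factor (1 - p_t)^γ is identically 1, so (up to the
# `eps` regularisation) the result matches `binarycrossentropy`. A minimal sketch,
# reusing `ŷ` and `y` from the docstring example above:
#
#     julia> Flux.binary_focal_loss(ŷ, y; gamma = 0) ≈ Flux.binarycrossentropy(ŷ, y)
#     true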

"""

focal_loss(ŷ, y; dims=1, agg=mean, gamma=2, eps=eps(eltype(ŷ)))

Return the [focal_loss](https://arxiv.org/pdf/1708.02002.pdf)

which can be used in classification tasks with highly imbalanced classes.

It down-weights well-classified examples and focuses on hard examples.

The input, 'ŷ', is expected to be normalized (i.e. [softmax](@ref Softmax) output).

The modulating factor, `γ == gamma`, controls the down-weighting strength.

For `γ == 0`, the loss is mathematically equivalent to [`Losses.crossentropy`](@ref).

# Example

```jldoctest

julia> y = [1  0  0  0  1

0  1  0  1  0

0  0  1  0  0]

3×5 Matrix{Int64}:

1  0  0  0  1

0  1  0  1  0

0  0  1  0  0

julia> ŷ = softmax(reshape(-7:7, 3, 5) .* 1f0)

3×5 Matrix{Float32}:

0.0900306  0.0900306  0.0900306  0.0900306  0.0900306

0.244728   0.244728   0.244728   0.244728   0.244728

0.665241   0.665241   0.665241   0.665241   0.665241

julia> Flux.focal_loss(ŷ, y) ≈ 1.1277571935622628

true

```

"""

function focal_loss(ŷ, y; dims = 1, agg = mean, gamma = 2, eps::Real = epseltype(ŷ),
                    ϵ = nothing, γ = nothing)
    ϵ = _greek_ascii_depwarn(ϵ => eps, :focal_loss, "ϵ" => "eps")
    gamma_temp = _greek_ascii_depwarn(γ => gamma, :focal_loss, "γ" => "gamma")
    γ = gamma_temp isa Integer ? gamma_temp : ofeltype(ŷ, gamma_temp)
    _check_sizes(ŷ, y)
    ŷϵ = ŷ .+ ϵ
    agg(sum(@. -y * (1 - ŷϵ)^γ * log(ŷϵ); dims))
end
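
# With `gamma = 0` the weighting term (1 - ŷϵ)^γ is identically 1, so (up to the `eps`
# regularisation) the focal loss reduces to `crossentropy`, as the docstring states.
# A minimal sketch, reusing `ŷ` and `y` from the docstring example above:
#
#     julia> Flux.focal_loss(ŷ, y; gamma = 0) ≈ Flux.crossentropy(ŷ, y)
#     true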

"""

siamese_contrastive_loss(ŷ, y; margin = 1, agg = mean)

which can be useful for training Siamese Networks. It is given by

agg(@. (1 - y) * ŷ^2 + y * max(0, margin - ŷ)^2)

Specify `margin` to set the baseline for distance at which pairs are dissimilar.

# Example

```jldoctest

julia> ŷ = [0.5, 1.5, 2.5];

julia> Flux.siamese_contrastive_loss(ŷ, 1:3)

-4.833333333333333

julia> Flux.siamese_contrastive_loss(ŷ, 1:3, margin = 2)

-4.0

```

"""

function siamese_contrastive_loss(ŷ, y; agg = mean, margin::Real = 1)
    _check_sizes(ŷ, y)
    margin < 0 && throw(DomainError(margin, "Margin must be non-negative"))
    return agg(@. (1 - y) * ŷ^2 + y * max(0, margin - ŷ)^2)
end

```@meta
DocTestFilters = nothing
```
``````