MLUtils
mapobs
"""
    MappedData{batched, Fn, Data} <: AbstractDataContainer

Lazy pairing of a function `f` with a data container `data`.

The `batched` type parameter selects which `getindex` methods apply
(`:auto`, `:never` or `:always`). Instances are created by `mapobs`.
"""
struct MappedData{batched, Fn, Data} <: AbstractDataContainer
    f::Fn
    data::Data
end
# Render as `mapobs(f, data; batched=:mode)`. The function and the wrapped
# container are printed through a `:compact => true` context.
function Base.show(io::IO, data::MappedData{batched}) where {batched}
    compact_io = IOContext(io, :compact => true)
    print(io, "mapobs(")
    print(compact_io, data.f)
    print(io, ", ")
    print(compact_io, data.data)
    return print(io, string("; batched=:", batched, ")"))
end
# A mapped container has as many observations as the container it wraps.
function Base.length(data::MappedData)
    return numobs(data.data)
end
# `mdata[:]` is forwarded to range indexing over every observation.
function Base.getindex(data::MappedData, ::Colon)
    everything = 1:length(data)
    return data[everything]
end
# `batched=:auto`, single index: apply `f` to the single observation.
function Base.getindex(data::MappedData{:auto}, idx::Int)
    obs = getobs(data.data, idx)
    return data.f(obs)
end
# `batched=:auto`, several indices: `f` receives the whole batch at once.
function Base.getindex(data::MappedData{:auto}, idxs::AbstractVector)
    batch = getobs(data.data, idxs)
    return data.f(batch)
end
# `batched=:never`, single index: apply `f` to the single observation.
function Base.getindex(data::MappedData{:never}, idx::Int)
    obs = getobs(data.data, idx)
    return data.f(obs)
end
# `batched=:never`, several indices: `f` is applied to one observation at a
# time and the results are collected into an array.
function Base.getindex(data::MappedData{:never}, idxs::AbstractVector)
    transform_one(i) = data.f(getobs(data.data, i))
    return [transform_one(i) for i in idxs]
end
# `batched=:always`, single index: wrap the index in a one-element batch,
# apply `f`, then extract the first observation of the result.
function Base.getindex(data::MappedData{:always}, idx::Int)
    batch = getobs(data.data, [idx])
    mapped = data.f(batch)
    return getobs(mapped, 1)
end
# `batched=:always`, several indices: `f` receives the whole batch at once.
function Base.getindex(data::MappedData{:always}, idxs::AbstractVector)
    batch = getobs(data.data, idxs)
    return data.f(batch)
end
"""
    mapobs(f, data; batched=:auto)

Lazily map `f` over the observations in a data container `data`.
Returns a new data container `mdata` that can be indexed and has a length.
Indexing triggers the transformation `f`.

The `batched` keyword argument controls the behavior of `mdata[idx]` and
`mdata[idxs]`, where `idx` is an integer and `idxs` is a vector of integers:

- `batched=:auto` (default). Let `f` handle the two cases.
  Calls `f(getobs(data, idx))` and `f(getobs(data, idxs))`.
- `batched=:never`. The function `f` is always called on a single observation.
  Calls `f(getobs(data, idx))` and `[f(getobs(data, idx)) for idx in idxs]`.
- `batched=:always`. The function `f` is always called on a batch of observations.
  Calls `getobs(f(getobs(data, [idx])), 1)` and `f(getobs(data, idxs))`.

Throws an `ArgumentError` if `batched` is not one of the three symbols above.

# Examples

```julia
julia> data = (a=[1,2,3], b=[1,2,3]);

julia> mdata = mapobs(data) do x
         (c = x.a .+ x.b,  d = x.a .- x.b)
       end
mapobs(#25, (a = [1, 2, 3], b = [1, 2, 3]); batched=:auto)

julia> mdata[1]
(c = 2, d = 0)

julia> mdata[1:2]
(c = [2, 4], d = [0, 0])
```
"""
function mapobs(f::F, data::D; batched=:auto) where {F, D}
    # Indexing methods exist only for these three modes; reject anything else
    # here instead of failing later with a MethodError on the first index.
    if !(batched in (:auto, :never, :always))
        throw(ArgumentError(
            "`batched` must be :auto, :never or :always, got $(repr(batched))"))
    end
    return MappedData{batched, F, D}(f, data)
end
"""
    mapobs(fs, data)

Lazily map each function in the tuple `fs` over the observations in the data
container `data`. Returns a tuple with one transformed data container per
function.
"""
function mapobs(fs::Tuple, data)
    return map(fn -> mapobs(fn, data), fs)
end
"""
    NamedTupleData{D, names} <: AbstractDataContainer

Data container pairing `data` with a `NamedTuple` of functions `namedfs`
whose field names are `names`. Created by `mapobs(namedfs::NamedTuple, data)`.
"""
struct NamedTupleData{D, names} <: AbstractDataContainer
    data::D
    namedfs::NamedTuple{names}
end
# Fields are read with `getfield` because `getproperty` is overloaded for
# `NamedTupleData` to select a column.
function Base.length(data::NamedTupleData)
    container = getfield(data, :data)
    return numobs(container)
end
# Fetch observation `idx` once, apply every named function to it, and return
# the results as a `NamedTuple` with the same field names.
function Base.getindex(data::NamedTupleData{TData, F}, idx::Int) where {TData, F}
    fns = getfield(data, :namedfs)
    obs = getobs(getfield(data, :data), idx)
    results = map(fn -> fn(obs), values(fns))
    return NamedTuple{F}(results)
end
# `nameddata.x` selects one column: the function stored under `field` is
# lazily mapped over the wrapped container.
function Base.getproperty(data::NamedTupleData, field::Symbol)
    fn = getproperty(getfield(data, :namedfs), field)
    container = getfield(data, :data)
    return mapobs(fn, container)
end
# Render as `mapobs(namedfs, data)`.
function Base.show(io::IO, data::NamedTupleData)
    fns = getfield(data, :namedfs)
    container = getfield(data, :data)
    return print(io, string("mapobs(", fns, ", ", container, ")"))
end
"""
    mapobs(namedfs::NamedTuple, data)

Map a `NamedTuple` of functions over `data`, producing a data container whose
observations are `NamedTuple`s. Property access on the result selects a single
column.

```julia
data = 1:10
nameddata = mapobs((x = sqrt, y = log), data)
getobs(nameddata, 10) == (x = sqrt(10), y = log(10))
getobs(nameddata.x, 10) == sqrt(10)
```
"""
mapobs(namedfs::NamedTuple, data) = NamedTupleData(data, namedfs)
filterobs
"""
    filterobs(f, data; iterfn=_iterobs)

Return a view of data container `data` restricted to the indices `i` for
which `f(getobs(data, i)) === true`.

`iterfn` is called on `data` to produce the observations that `f` is tested
against; positions in that iteration are used as the indices to keep.

```julia
data = 1:10
numobs(data) == 10
fdata = filterobs(>(5), data)
numobs(fdata) == 5
```
"""
function filterobs(f, data; iterfn=_iterobs)
    numbered = enumerate(iterfn(data))
    kept = [i for (i, obs) in numbered if f(obs)]
    return obsview(data, kept)
end
# Materialize every observation of `data`, in index order.
function _iterobs(data)
    return collect(getobs(data, i) for i in 1:numobs(data))
end
groupobs
"""
    groupobs(f, data)

Split data container `data` into several data containers, grouping
observations by the value of `f(obs)`. Returns a `Dict` mapping each group
value to a view of the observations in that group.

```julia
data = -10:10
datas = groupobs(>(0), data)
length(datas) == 2
```
"""
function groupobs(f, data)
    members = Dict{Any, Vector{Int}}()
    for i in 1:numobs(data)
        key = f(getobs(data, i))
        # Start an empty index list the first time a key is seen.
        push!(get!(() -> Int[], members, key), i)
    end
    return Dict(label => obsview(data, idxs) for (label, idxs) in members)
end
joinumobs
"""
    JoinedData{D, N} <: AbstractDataContainer

Concatenation of `N` data containers, all of the same type `D`.
`datas` holds the containers and `ns` the number of observations of each.
"""
struct JoinedData{D, N} <: AbstractDataContainer
    datas::NTuple{N, D}
    ns::NTuple{N, Int}
end
# Convenience constructor: compute the per-container observation counts.
function JoinedData(datas)
    counts = numobs.(datas)
    return JoinedData(datas, counts)
end
# Total number of observations across all joined containers.
function Base.length(data::JoinedData)
    return sum(data.ns)
end
# Look up observation `idx` in the concatenation: walk the containers in
# order, subtracting each container's size until the index falls inside one.
# Throws a `BoundsError` when `idx` is outside `1:length(data)`; previously an
# index past the end fell through the loop and silently returned `nothing`.
function Base.getindex(data::JoinedData, idx)
    1 <= idx <= length(data) || throw(BoundsError(data, idx))
    offset = idx  # index relative to the container currently inspected
    for (i, n) in enumerate(data.ns)
        offset <= n && return getobs(data.datas[i], offset)
        offset -= n
    end
    # Unreachable after the range check above; kept so the method can never
    # return `nothing`.
    throw(BoundsError(data, idx))
end
"""
    joinobs(datas...)

Concatenate the data containers `datas` into a single data container.

```julia
data1, data2 = 1:10, 11:20
jdata = joinobs(data1, data2)
getobs(jdata, 15) == 15
```
"""
function joinobs(datas...)
    return JoinedData(datas)
end
"""
    shuffleobs([rng], data)

Return a "subset" of `data` that spans all observations, but
has the order of the observations shuffled.

The values of `data` itself are not copied. Instead only the
indices are shuffled. This function calls [`obsview`](@ref) to
accomplish that, which means that the return value is likely of a
different type than `data`.

```julia
# For Arrays the subset will be of type SubArray
@assert typeof(shuffleobs(rand(4,10))) <: SubArray

# Iterate through all observations in random order
for x in eachobs(shuffleobs(X))
    ...
end
```

The optional parameter `rng` allows one to specify the
random number generator used for shuffling. This is useful when
reproducible results are desired. By default, the RNG returned by
`Random.default_rng()` is used.
See `Random` in Julia's standard library for more info.

For this function to work, the type of `data` must implement
[`numobs`](@ref) and [`getobs`](@ref). See [`ObsView`](@ref)
for more information.
"""
# `Random.default_rng()` is the documented accessor for the default RNG and
# replaces the legacy `Random.GLOBAL_RNG` binding.
shuffleobs(data) = shuffleobs(Random.default_rng(), data)
# Shuffle by drawing a random permutation of the observation indices from
# `rng` and viewing `data` through it.
function shuffleobs(rng::AbstractRNG, data)
    permutation = randperm(rng, numobs(data))
    return obsview(data, permutation)
end