obstransform.jl

MLUtils/obstransform.jl is a source file in module MLUtils

			

mapobs


			
			
			
			
			# Lazy container that pairs a transformation `f` with a wrapped container `data`.
			# The first type parameter `batched` carries the batching mode as a value, so
			# that the `getindex` methods of this type can dispatch on it. `F` and `D` are
			# the types of the two fields.
			struct MappedData{batched,F,D} <: AbstractDataContainer
			    f::F       # transformation applied when the container is indexed
			    data::D    # wrapped data container
			end
			

			

			
			# Print `data` in the form `mapobs(f, data; batched=:mode)`. The function and
			# the wrapped container are printed through an `IOContext` with `:compact => true`.
			function Base.show(io::IO, data::MappedData{batched}) where {batched}
			    compact_io = IOContext(io, :compact => true)
			    print(io, "mapobs(")
			    print(compact_io, data.f)
			    print(io, ", ")
			    print(compact_io, data.data)
			    print(io, string("; batched=:", batched, ")"))
			end
			

			

			
			
			
			# A mapped container has as many observations as the container it wraps.
			function Base.length(data::MappedData)
			    return numobs(data.data)
			end
			

			
			
			
			# `mdata[:]` is forwarded to vector indexing with the full range `1:length(mdata)`.
			function Base.getindex(data::MappedData, ::Colon)
			    everything = 1:length(data)
			    return data[everything]
			end
			

			

			
			
			
			# `batched = :auto`: `f` is handed whatever `getobs` returns, both for a single
			# index and for a vector of indices.
			function Base.getindex(data::MappedData{:auto}, idx::Int)
			    obs = getobs(data.data, idx)
			    return data.f(obs)
			end

			function Base.getindex(data::MappedData{:auto}, idxs::AbstractVector)
			    batch = getobs(data.data, idxs)
			    return data.f(batch)
			end
			

			

			
			
			
			# `batched = :never`: `f` only ever sees one observation at a time. Indexing
			# with a vector applies `f` to each requested observation separately and
			# collects the results in an array.
			function Base.getindex(data::MappedData{:never}, idx::Int)
			    obs = getobs(data.data, idx)
			    return data.f(obs)
			end

			function Base.getindex(data::MappedData{:never}, idxs::AbstractVector)
			    return [data.f(getobs(data.data, i)) for i in idxs]
			end
			

			

			
			
			
			# `batched = :always`: `f` only ever sees a batch. A single index is wrapped in
			# a one-element vector, `f` is applied to that batch, and the first observation
			# of the result is extracted with `getobs(..., 1)`.
			function Base.getindex(data::MappedData{:always}, idx::Int)
			    single_batch = getobs(data.data, [idx])
			    mapped = data.f(single_batch)
			    return getobs(mapped, 1)
			end

			function Base.getindex(data::MappedData{:always}, idxs::AbstractVector)
			    batch = getobs(data.data, idxs)
			    return data.f(batch)
			end
			

			

			

			
			
			
			"""
			    mapobs(f, data; batched=:auto)

			Lazily map `f` over the observations in a data container `data`.
			Returns a new data container `mdata` that can be indexed and has a length.
			Indexing triggers the transformation `f`.

			The `batched` keyword argument controls the behavior of `mdata[idx]` and
			`mdata[idxs]`, where `idx` is an integer and `idxs` is a vector of integers:

			- `batched=:auto` (default). Let `f` handle the two cases.
			  Calls `f(getobs(data, idx))` and `f(getobs(data, idxs))`.
			- `batched=:never`. The function `f` is always called on a single observation.
			  Calls `f(getobs(data, idx))` and `[f(getobs(data, idx)) for idx in idxs]`.
			- `batched=:always`. The function `f` is always called on a batch of observations.
			  Calls `getobs(f(getobs(data, [idx])), 1)` and `f(getobs(data, idxs))`.

			Any other value of `batched` throws an `ArgumentError`.

			# Examples

			```julia
			julia> data = (a=[1,2,3], b=[1,2,3]);

			julia> mdata = mapobs(data) do x
			         (c = x.a .+ x.b,  d = x.a .- x.b)
			       end
			mapobs(#25, (a = [1, 2, 3], b = [1, 2, 3]); batched=:auto)

			julia> mdata[1]
			(c = 2, d = 0)

			julia> mdata[1:2]
			(c = [2, 4], d = [0, 0])
			```
			"""
			function mapobs(f::F, data::D; batched=:auto) where {F,D}
			    # Indexing is only defined for these three modes; reject anything else here
			    # instead of returning a container that fails on first access.
			    batched in (:auto, :never, :always) ||
			        throw(ArgumentError("`batched` must be :auto, :never or :always, got $(repr(batched))"))
			    return MappedData{batched,F,D}(f, data)
			end
			

			

			
			
			
			"""
			    mapobs(fs, data)

			For a tuple of functions `fs`, call `mapobs(f, data)` once per function `f`,
			each time on the same data container `data`.
			Returns the resulting containers as a tuple, in the order of `fs`.
			"""
			function mapobs(fs::Tuple, data)
			    return map(f -> mapobs(f, data), fs)
			end
			

			

			

			
			
			# Container holding a wrapped data container together with a `NamedTuple` of
			# functions. `TData` is the type of the wrapped container; `F` is the first
			# parameter of `NamedTuple{F}`, i.e. the field names of `namedfs`.
			struct NamedTupleData{TData,F} <: AbstractDataContainer
			    data::TData
			    namedfs::NamedTuple{F}
			end
			

			

			
			
			
			# The length is the number of observations of the wrapped container. The field
			# is read with `getfield` rather than dot syntax.
			function Base.length(data::NamedTupleData)
			    inner = getfield(data, :data)
			    return numobs(inner)
			end
			

			

			
			# Fetch observation `idx` once, apply every function in `namedfs` to it, and
			# return the results as a `NamedTuple` with the same field names `F`.
			function Base.getindex(data::NamedTupleData{TData,F}, idx::Int) where {TData,F}
			    fns = getfield(data, :namedfs)
			    obs = getobs(getfield(data, :data), idx)
			    results = map(fn -> fn(obs), values(fns))
			    return NamedTuple{F}(results)
			end
			

			

			
			
			
			# `nameddata.name` looks up the function stored under `name` in `namedfs` and
			# returns `mapobs` of that single function over the wrapped container.
			function Base.getproperty(data::NamedTupleData, field::Symbol)
			    fn = getproperty(getfield(data, :namedfs), field)
			    inner = getfield(data, :data)
			    return mapobs(fn, inner)
			end
			

			

			
			
			
			# Print as `mapobs(namedfs, data)`. The text is assembled with `string` first
			# and then written to `io` in one `print` call.
			function Base.show(io::IO, data::NamedTupleData)
			    fns = getfield(data, :namedfs)
			    inner = getfield(data, :data)
			    print(io, string("mapobs(", fns, ", ", inner, ")"))
			end
			

			

			
			
			
			"""
			    mapobs(namedfs::NamedTuple, data)

			Apply a `NamedTuple` of functions to `data`. The result is a data container
			whose observations are `NamedTuple`s with one entry per function. A single
			column of the result can be selected with field syntax.

			```julia
			data = 1:10
			nameddata = mapobs((x = sqrt, y = log), data)
			getobs(nameddata, 10) == (x = sqrt(10), y = log(10))
			getobs(nameddata.x, 10) == sqrt(10)
			```
			"""
			mapobs(namedfs::NamedTuple, data) = NamedTupleData(data, namedfs)

filterobs


			
			
			
			
			
			"""
			    filterobs(f, data; iterfn = _iterobs)

			Return a view of the data container `data` that keeps only the observations
			for which the predicate `f` returns `true`.

			The observations are produced by `iterfn(data)` and enumerated starting at 1;
			the positions of the accepted ones are passed to `obsview`.

			```julia
			data = 1:10
			numobs(data) == 10
			fdata = filterobs(>(5), data)
			numobs(fdata) == 5
			```
			"""
			function filterobs(f, data; iterfn = _iterobs)
			    observations = iterfn(data)
			    kept = [pos for (pos, obs) in enumerate(observations) if f(obs)]
			    return obsview(data, kept)
			end
			

			

			
			
			# Eagerly collect `getobs(data, k)` for every `k` in `1:numobs(data)` into an array.
			function _iterobs(data)
			    return [getobs(data, k) for k in 1:numobs(data)]
			end

groupobs


			
			
			
			
			
			"""
			    groupobs(f, data)

			Partition the data container `data` according to the value of `f(obs)`.
			Returns a `Dict` that maps each distinct value returned by `f` to an
			`obsview` of the observations that produced it.

			```julia
			data = -10:10
			datas = groupobs(>(0), data)
			length(datas) == 2
			```
			"""
			function groupobs(f, data)
			    members = Dict{Any,Vector{Int}}()
			    for i in 1:numobs(data)
			        key = f(getobs(data, i))
			        # `get!` inserts an empty index list the first time a key is seen.
			        push!(get!(Vector{Int}, members, key), i)
			    end
			    return Dict(key => obsview(data, idxs) for (key, idxs) in members)
			end

joinobs


			
			
			
			
			# Concatenation of `N` data containers stored as an `NTuple{N,T}`.
			# `ns` holds one `Int` per container; the one-argument constructor fills it
			# with `numobs` of each container.
			struct JoinedData{T,N} <: AbstractDataContainer
			    datas::NTuple{N,T}
			    ns::NTuple{N,Int}
			end
			

			

			
			

	
			# Convenience constructor: compute the observation count of every container
			# and forward to the two-argument constructor.
			function JoinedData(datas)
			    counts = numobs.(datas)
			    return JoinedData(datas, counts)
			end
			

			

			
			
			
			# Total number of observations: the sum of the per-container counts.
			function Base.length(data::JoinedData)
			    return sum(data.ns)
			end
			

			

			
			# Look up observation `idx` of the concatenated containers.
			#
			# The containers are walked in order. `offset` is `idx` relative to the start
			# of the current container, so the first container with `offset <= n` holds
			# the requested observation.
			#
			# Throws a `BoundsError` when `idx` is smaller than 1 or larger than the total
			# number of observations. Previously an index past the end fell out of the
			# loop and silently returned `nothing`.
			function Base.getindex(data::JoinedData, idx)
			    idx < 1 && throw(BoundsError(data, idx))
			    offset = idx    # keep `idx` intact so the error can report the original index
			    for (i, n) in enumerate(data.ns)
			        offset <= n && return getobs(data.datas[i], offset)
			        offset -= n
			    end
			    throw(BoundsError(data, idx))
			end
			

			

			
			
			
			"""
			    joinobs(datas...)

			Concatenate data containers `datas`.

			```julia
			data1, data2 = 1:10, 11:20
			jdata = joinobs(data1, data2)
			getobs(jdata, 15) == 15
			```
			"""
			joinobs(datas...) = JoinedData(datas)
			

			

			
			
			
			"""
			    shuffleobs([rng], data)

			Return a "subset" of `data` that spans all observations, but
			has the order of the observations shuffled.

			The values of `data` itself are not copied. Instead only the
			indices are shuffled. This function calls [`obsview`](@ref) to
			accomplish that, which means that the return value is likely of a
			different type than `data`.

			```julia
			# For Arrays the subset will be of type SubArray
			@assert typeof(shuffleobs(rand(4,10))) <: SubArray

			# Iterate through all observations in random order
			for x in eachobs(shuffleobs(X))
			    ...
			end
			```

			The optional parameter `rng` allows one to specify the
			random number generator used for shuffling. This is useful when
			reproducible results are desired. By default, the default RNG
			returned by `Random.default_rng()` is used.
			See `Random` in Julia's standard library for more info.

			For this function to work, the type of `data` must implement
			[`numobs`](@ref) and [`getobs`](@ref). See [`ObsView`](@ref)
			for more information.
			"""
			shuffleobs(data) = shuffleobs(Random.default_rng(), data)
			

			

			
			# Draw a random permutation of `1:numobs(data)` from `rng` and return the
			# `obsview` of `data` in that order.
			function shuffleobs(rng::AbstractRNG, data)
			    order = randperm(rng, numobs(data))
			    return obsview(data, order)
			end