FastTabular
"""
    EncodedTableRow{M, N, T} <: Block

Block describing a preprocessed table row with `M` categorical and `N` continuous
columns. An observation for this block is a tuple of two vectors: the `M`
categorical values and the `N` continuous values.

# Fields
- `catcols`: tuple holding the `M` categorical columns.
- `contcols`: tuple holding the `N` continuous columns.
- `categorydict`: value of type `T` that is indexed by a categorical column to
  obtain the collection of categories known for that column.
"""
struct EncodedTableRow{M, N, T} <: Block
    catcols::NTuple{M}
    contcols::NTuple{N}
    categorydict::T
end
"""
    EncodedTableRow(catcols, contcols, categorydict)

Generic constructor for `EncodedTableRow`. `catcols` and `contcols` may be any
iterable collections of columns; they are materialized as tuples and their lengths
become the type parameters `M` and `N`. The type of `categorydict` becomes the
third type parameter.
"""
function EncodedTableRow(catcols, contcols, categorydict)
    # Materialize the column collections so their lengths are fixed and they can be
    # stored in the `NTuple` fields.
    cats = Tuple(catcols)
    conts = Tuple(contcols)
    # All three type parameters have to be supplied: `EncodedTableRow{M, N}` on its
    # own is a `UnionAll` without any constructor method, so calling it would raise
    # a `MethodError`.
    return EncodedTableRow{length(cats), length(conts), typeof(categorydict)}(
        cats, conts, categorydict
    )
end
"""
    checkblock(::EncodedTableRow{M, N}, x::Tuple{Vector, Vector})

Return `true` when the first vector of `x` has `M` elements and the second vector
has `N` elements, and `false` otherwise.
"""
function checkblock(::EncodedTableRow{M, N}, x::Tuple{Vector, Vector}) where {M, N}
    catvals, contvals = x
    return length(catvals) == M && length(contvals) == N
end
"""
    showblock!(io, ::ShowText, block::EncodedTableRow, obs)

Text display of an encoded table row: prints the fixed placeholder
`EncodedTableRow(...)` to `io`. Neither `block` nor `obs` is inspected.
"""
function showblock!(io, ::ShowText, block::EncodedTableRow, obs)
    return print(io, "EncodedTableRow(...)")
end
"""
    TabularPreprocessing{T} <: Encoding
    TabularPreprocessing(tfms)

Encoding that preprocesses a `TableRow`. The stored transformation `tfms` is one
of the following preprocessing steps, or a sequence of them:

- [`DataAugmentation.NormalizeRow`](#) (normalizes the continuous columns of a row)
- [`DataAugmentation.FillMissing`](#) (fills in missing values)
- [`DataAugmentation.Categorify`](#) (label-encodes categorical columns, so that the
  values can later be used for indexing into embedding matrices)
"""
struct TabularPreprocessing{T} <: Encoding
    tfms::T
end
"""
    TabularPreprocessing(td::TableDataset)

Create a `TabularPreprocessing` whose transformations are the ones returned by
`gettransforms(td)`.
"""
function TabularPreprocessing(td::TableDataset)
    tfms = gettransforms(td)
    return TabularPreprocessing(tfms)
end
"""
    encodedblock(::TabularPreprocessing, block::TableRow)

Return the `EncodedTableRow` block that results from encoding `block`. It carries
over the categorical columns, continuous columns and category dictionary of `block`.
"""
function encodedblock(::TabularPreprocessing, block::TableRow)
    catcols, contcols = block.catcols, block.contcols
    return EncodedTableRow(catcols, contcols, block.categorydict)
end
"""
    encode(tt::TabularPreprocessing, _, block::TableRow, row)

Apply the transformations stored in `tt` to `row` and return a tuple
`(catvals, contvals)` of two vectors: the transformed values of the categorical
columns and of the continuous columns of `block`, each in the column order given
by `block`.

Only columns listed in `block.catcols` or `block.contcols` are passed on to the
transformations. The second positional argument is ignored.
"""
function encode(tt::TabularPreprocessing, _, block::TableRow, row)
    # Pair every column name with its value. This relies on iterating `row`
    # yielding the values in the same order as `Tables.columnnames(row)`.
    isused(name) = name ∈ block.catcols || name ∈ block.contcols
    entries = collect(zip(Tables.columnnames(row), row))
    usedrow = NamedTuple(filter(entry -> isused(entry[1]), entries))

    item = DataAugmentation.TabularItem(usedrow, keys(usedrow))
    tfmrow = DataAugmentation.apply(tt.tfms, item).data

    # Gather the transformed values for a collection of columns into a vector.
    valuesfor(cols) = collect(map(col -> tfmrow[col], cols))
    return (valuesfor(block.catcols), valuesfor(block.contcols))
end
"""
    setup(::Type{TabularPreprocessing}, block::TableRow, data::TableDataset)

Create a `TabularPreprocessing` for `block` from the dataset `data`. The
transformations are obtained from `gettransforms` using the categorical and
continuous columns declared by `block`.
"""
function setup(::Type{TabularPreprocessing}, block::TableRow, data::TableDataset)
    tfms = gettransforms(data, block.catcols, block.contcols)
    return TabularPreprocessing(tfms)
end
blockmodel
"""
blockmodel(inblock::TableRow{M, N}, outblock::Union{Continuous, OneHotTensor{0}}, backbone=nothing) where {M, N}
Construct a model for tabular classification or regression. `backbone` should be a
NamedTuple of categorical, continuous, and a finalclassifier layer, with
the first two taking in batches of corresponding row value matrices.
"""
"""
    blockmodel(inblock::EncodedTableRow, outblock::OneHotTensor{0}, backbone)

Create a model for tabular classification. `backbone` should be a named tuple
`(categorical = ..., continuous = ...)`. The two backbones are combined in a
[`TabularModel`](#) together with a final `Dense` layer that has one output per
class in `outblock.classes`. See [`TabularModel`](#) for more info.
"""
function blockmodel(inblock::EncodedTableRow, outblock::OneHotTensor{0}, backbone)
    catback = backbone.categorical
    contback = backbone.continuous
    nclasses = length(outblock.classes)
    # NOTE(review): the input width of 100 is hard-coded; presumably it has to match
    # what `TabularModel` feeds into its final layer - confirm.
    head = Dense(100, nclasses)
    return TabularModel(catback, contback, head)
end
"""
    blockmodel(inblock::EncodedTableRow, outblock::Continuous, backbone)

Create a model for tabular regression. `backbone` should be a named tuple
`(categorical = ..., continuous = ...)`. The two backbones are combined in a
[`TabularModel`](#) together with a final `Dense` layer that has `outblock.size`
outputs. See [`TabularModel`](#) for more info.
"""
function blockmodel(inblock::EncodedTableRow, outblock::Continuous, backbone)
    catback = backbone.categorical
    contback = backbone.continuous
    # NOTE(review): the input width of 100 is hard-coded; presumably it has to match
    # what `TabularModel` feeds into its final layer - confirm.
    head = Dense(100, outblock.size)
    return TabularModel(catback, contback, head)
end
"""
    blockbackbone(inblock::EncodedTableRow{M, N})

Build the default backbone for an encoded table row and return it as a named tuple
`(categorical = ..., continuous = ...)`.

The categorical backbone is created by `tabular_embedding_backbone` from the
embedding sizes that `_get_emb_sz` derives from the number of categories of each
categorical column. The continuous backbone is created by
`tabular_continuous_backbone(N)`.
"""
function blockbackbone(inblock::EncodedTableRow{M, N}) where {M, N}
    # Number of categories recorded for a categorical column.
    ncategories(col) = length(inblock.categorydict[col])
    cardinalities = collect(map(ncategories, inblock.catcols))
    embedszs = _get_emb_sz(cardinalities)
    return (
        categorical = tabular_embedding_backbone(embedszs),
        continuous = tabular_continuous_backbone(N),
    )
end
"""
    gettransformdict(td, ::Type{DataAugmentation.NormalizeRow}, cols)

The `gettransformdict` helper methods can be used for quickly constructing the
dictionaries required for creating the various tabular transformations available in
DataAugmentation.jl. They assume that the table in the `TableDataset` object `td`
(`td.table`) has the Tables.jl column-access interface defined.

This method returns a `Dict` that maps every column in `cols` to the tuple
`(mean, std)` computed over the non-missing values of that column.
"""
function gettransformdict(td, ::Type{DataAugmentation.NormalizeRow}, cols)
    stats = Dict()
    # A plain loop is used because only the side effect of filling `stats` matters;
    # `map` would build a result collection that is thrown away.
    for col in cols
        present = skipmissing(Tables.getcolumn(td.table, col))
        stats[col] = (Statistics.mean(present), Statistics.std(present))
    end
    return stats
end
"""
    gettransformdict(td, ::Type{DataAugmentation.FillMissing}, cols)

Return a `Dict` that maps every column in `cols` to the median of the non-missing
values of that column in `td.table`. The table is accessed through
`Tables.getcolumn`.
"""
function gettransformdict(td, ::Type{DataAugmentation.FillMissing}, cols)
    fillvalues = Dict()
    # A plain loop is used because only the side effect of filling `fillvalues`
    # matters; `map` would build a result collection that is thrown away.
    for col in cols
        present = skipmissing(Tables.getcolumn(td.table, col))
        fillvalues[col] = Statistics.median(present)
    end
    return fillvalues
end
"""
    gettransformdict(td, ::Type{DataAugmentation.Categorify}, cols)

Return a `Dict` that maps every column in `cols` to the unique values found in
that column of `td.table`. The table is accessed through `Tables.getcolumn`;
missing values are not filtered out.
"""
function gettransformdict(td, ::Type{DataAugmentation.Categorify}, cols)
    categories = Dict()
    # A plain loop is used because only the side effect of filling `categories`
    # matters; `map` would build a result collection that is thrown away.
    for col in cols
        categories[col] = unique(Tables.getcolumn(td.table, col))
    end
    return categories
end
"""
    getcoltypes(td::TableDataset)

Return a tuple `(catcols, contcols)` with the names of the categorical and the
continuous columns of the table in `td`.

A column counts as continuous when its element type in the table's schema is a
subtype of `Number`, optionally combined with `Missing`. Every other column counts
as categorical.
"""
function getcoltypes(td::TableDataset)
    schema = Tables.schema(td.table)
    iscontinuous(coltype) = coltype <: Union{<:Number, <:Union{Missing, <:Number}}
    contcols = Tuple(
        name for (name, coltype) in zip(schema.names, schema.types)
        if iscontinuous(coltype)
    )
    catcols = Tuple(name for name in schema.names if name ∉ contcols)
    return catcols, contcols
end
"""
    gettransforms(td::TableDataset, catcols, contcols)

Return a composition of basic tabular transformations constructed for the given
`TableDataset`.

The required statistics are computed from `td` with `gettransformdict`. The
resulting `DataAugmentation.FillMissing` and `DataAugmentation.NormalizeRow`
transformations cover `contcols`, and `DataAugmentation.Categorify` covers
`catcols`. They are combined with `|>` in the order fill missing, normalize,
categorify.
"""
function gettransforms(td::TableDataset, catcols, contcols)
    # Per-column statistics gathered from the dataset.
    normstats = gettransformdict(td, DataAugmentation.NormalizeRow, contcols)
    fillvalues = gettransformdict(td, DataAugmentation.FillMissing, contcols)
    categories = gettransformdict(td, DataAugmentation.Categorify, catcols)

    normalize = DataAugmentation.NormalizeRow(normstats, contcols)
    categorify = DataAugmentation.Categorify(categories, catcols)
    fillmissing = DataAugmentation.FillMissing(fillvalues, contcols)

    return (fillmissing |> normalize) |> categorify
end
"""
    gettransforms(td::TableDataset)

Return the tabular transformations for `td`, using the categorical and continuous
columns detected by `getcoltypes(td)`.
"""
function gettransforms(td::TableDataset)
    catcols, contcols = getcoltypes(td)
    return gettransforms(td, catcols, contcols)
end
@testset "TabularPreprocessing [encoding]" begin
    # A single observation with three numeric and two string-valued columns.
    colnames = [:col1, :col2, :col3, :col4, :col5]
    colvals = [1, 2, 3, "a", "x"]
    row = NamedTuple(zip(colnames, colvals))

    catcols = (:col4, :col5)
    contcols = (:col1, :col2, :col3)

    # (mean, std) used for normalizing each continuous column.
    normdict = Dict(
        :col1 => (10, 100),
        :col2 => (100, 10),
        :col3 => (15, 1),
    )
    normalize = DataAugmentation.NormalizeRow(normdict, contcols)
    tfm = TabularPreprocessing(normalize)

    categorydict = Dict(:col4 => ["a", "b"], :col5 => ["x", "y", "z"])
    block = TableRow(catcols, contcols, categorydict)

    # Encoding built by hand.
    testencoding(tfm, block, row)

    # Encoding derived from a dataset through `setup`.
    td = TableDataset(DataFrame([row, row]))
    testencoding(setup(TabularPreprocessing, block, td), block, row)
end
@testset "blockbackbone" begin
    # Building the default backbone for a block with one categorical column (two
    # categories) and one continuous column must not emit any warnings.
    @test_nowarn begin
        inblock = EncodedTableRow((:x,), (:y,), Dict(:x => [1, 2]))
        FastAI.blockbackbone(inblock)
    end
end