|
"""
    NormalizeRow(dict, cols)

Transform that standardizes selected columns of the row stored in a
`TabularItem`.

`dict` maps a column name to a `(mean, std)` tuple. When the transform is
applied, every column listed in `cols` has its value `v` replaced by
`(v - mean) / std`; all other columns are passed through unchanged.

## Example

```julia
using DataAugmentation

cols = [:col1, :col2, :col3]
row = (; zip(cols, [1, 2, 3])...)
item = TabularItem(row, cols)
normdict = Dict(:col1 => (1, 1), :col2 => (2, 2))

tfm = NormalizeRow(normdict, [:col1, :col2])
apply(tfm, item)
```
"""
struct NormalizeRow{D, C} <: Transform
    # Column name => (mean, standard deviation) used for normalization.
    dict::D
    # Names of the columns that should be normalized.
    cols::C
end
| 27 | + |
# Apply `NormalizeRow` to a `TabularItem`: values of the columns listed in
# `tfm.cols` are shifted by the column mean and divided by the column standard
# deviation (both looked up in `tfm.dict`); every other value is kept as is.
# `randstate` is accepted for interface compatibility and is not used.
function apply(tfm::NormalizeRow, item::TabularItem; randstate=nothing)
    entries = Iterators.map(item.columns, item.data) do name, value
        # Columns that were not selected are passed through untouched.
        name in tfm.cols || return (name, value)
        center, scale = tfm.dict[name]
        return (name, (value - center) / scale)
    end
    normalized = NamedTuple(entries)
    return TabularItem(normalized, item.columns)
end
| 38 | + |
"""
    FillMissing(dict, cols)

Transform that replaces `missing` entries in selected columns of the row
stored in a `TabularItem`.

`dict` maps a column name to the value used as a replacement. When the
transform is applied, a `missing` value in any column listed in `cols` is
replaced by `dict[col]`; non-missing values and all other columns are passed
through unchanged.

## Example

```julia
using DataAugmentation

cols = [:col1, :col2, :col3]
row = (; zip(cols, [1, 2, 3])...)
item = TabularItem(row, cols)
fmdict = Dict(:col1 => 100, :col2 => 100)

tfm = FillMissing(fmdict, [:col1, :col2])
apply(tfm, item)
```
"""
struct FillMissing{D, C} <: Transform
    # Column name => value substituted for `missing`.
    dict::D
    # Names of the columns whose missing values should be filled.
    cols::C
end
| 65 | + |
# Apply `FillMissing` to a `TabularItem`: a `missing` value in a column listed
# in `tfm.cols` is replaced by that column's entry in `tfm.dict`; everything
# else is kept as is. `randstate` is accepted for interface compatibility and
# is not used.
function apply(tfm::FillMissing, item::TabularItem; randstate=nothing)
    filled = NamedTuple(
        (name, (name in tfm.cols && ismissing(value)) ? tfm.dict[name] : value)
        for (name, value) in zip(item.columns, item.data)
    )
    return TabularItem(filled, item.columns)
end
| 75 | + |
"""
    Categorify(dict, cols)

Label encodes the values of a row present in `TabularItem` for the
columns specified in `cols` using `dict`, which contains the column
names as dictionary keys and the unique values of column present
as dictionary values.

A value is encoded as its position among the unique values of its column
plus one. If there are any `missing` values in the values to be transformed,
they are replaced by 1.

`missing` entries found among the unique values in `dict` are dropped (with a
warning) when the transform is constructed. The cleanup is performed on a
copy, so the dictionary passed in by the caller is never modified.

## Example

```julia
using DataAugmentation

cols = [:col1, :col2, :col3]
row = (; zip(cols, ["cat", 2, 3])...)
item = TabularItem(row, cols)
catdict = Dict(:col1 => ["dog", "cat"])

tfm = Categorify(catdict, [:col1])
apply(tfm, item)
```
"""
struct Categorify{T, S} <: Transform
    # Column name => unique (non-missing) values of that column.
    dict::T
    # Names of the columns that should be label encoded.
    cols::S
    function Categorify{T, S}(dict::T, cols::S) where {T, S}
        cleaned = dict
        for (col, vals) in dict
            if any(ismissing, vals)
                # Copy lazily, on the first column that needs cleaning, so the
                # caller's dictionary is not mutated and is not modified while
                # it is being iterated. Without missings, `dict` is stored as is.
                if cleaned === dict
                    cleaned = copy(dict)
                end
                cleaned[col] = filter(!ismissing, vals)
                @warn "There is a missing value present for category '$col' which will be removed from Categorify dict"
            end
        end
        new{T, S}(cleaned, cols)
    end
end

# Convenience constructor that infers the type parameters from the arguments.
Categorify(dict::T, cols::S) where {T, S} = Categorify{T, S}(dict, cols)
| 116 | + |
# Apply `Categorify` to a `TabularItem`: for every column listed in `tfm.cols`
# the value is replaced by an integer label. `missing` maps to 1; any other
# value maps to its position in `tfm.dict[col]` plus one. Columns that are not
# listed are kept as is. `randstate` is accepted for interface compatibility
# and is not used.
#
# Throws an `ArgumentError` when a value is not among the known categories of
# its column (previously this surfaced as an opaque `MethodError` from
# `nothing + 1`).
function apply(tfm::Categorify, item::TabularItem; randstate=nothing)
    x = NamedTuple(Iterators.map(item.columns, item.data) do col, val
        if col in tfm.cols
            if ismissing(val)
                val = 1
            else
                # Predicate form avoids allocating a temporary comparison array.
                idx = findfirst(==(val), tfm.dict[col])
                if idx === nothing
                    throw(ArgumentError(
                        "value $(repr(val)) of column '$col' is not among the categories known to Categorify",
                    ))
                end
                # Label 1 is reserved for `missing`, so known categories start at 2.
                val = idx + 1
            end
        end
        (col, val)
    end)
    return TabularItem(x, item.columns)
end
0 commit comments