
Commit 8d2b407

Merge pull request #12 from rxavier/main
OrdinalEncoder starting at 1
2 parents fdc451f + 5ee2523 commit 8d2b407


5 files changed (+697, -49 lines)


README.md

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ Full documentation including this readme and API reference can be found at [RTD]
 
 ## Usage
 
-Embedding Encoder works like any scikit-learn transformer, the only difference being that it requires `y` to be passed as it is the neural network's target. By default it will convert categorical variables into integer arrays by applying scikit-learn's `OrdinalEncoder`.
+Embedding Encoder works like any scikit-learn transformer, the only difference being that it requires `y` to be passed as it is the neural network's target.
 
 Embedding Encoder will assume that all input columns are categorical and will calculate embeddings for each, unless the `numeric_vars` argument is passed. In that case, numeric variables will be included as an additional input to the neural network but no embeddings will be calculated for them, and they will not be included in the output transformation.
 
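For context, a minimal usage sketch of the workflow the README describes. The toy data, column names, and import path below are illustrative assumptions, not taken from this PR:

import pandas as pd

from embedding_encoder import EmbeddingEncoder  # import path assumed

# Toy data: two categorical columns and a binary target (illustrative only).
X = pd.DataFrame({"city": ["A", "B", "A", "C"], "plan": ["x", "y", "y", "x"]})
y = pd.Series([0, 1, 1, 0])

ee = EmbeddingEncoder(task="classification")
embeddings = ee.fit_transform(X, y)  # y is required: it is the neural network's target

After this PR, the ordinal encoding step is always applied internally (with codes starting at 1), so the categorical columns do not need to be pre-encoded by the caller.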

embedding_encoder/core.py

Lines changed: 17 additions & 34 deletions
@@ -6,7 +6,8 @@
 import pandas as pd
 import numpy as np
 from sklearn.base import BaseEstimator, TransformerMixin
-from sklearn.preprocessing import OrdinalEncoder
+
+from embedding_encoder.custom_ordinal import OrdinalEncoderStart1
 
 
 class EmbeddingEncoder(BaseEstimator, TransformerMixin):
@@ -16,9 +17,6 @@ class EmbeddingEncoder(BaseEstimator, TransformerMixin):
     embedding layers. Numeric variables can be included as additional inputs by setting
     :attr:`numeric_vars`.
 
-    By default, non numeric variables are encoded with scikit-learn's `OrdinalEncoder`. This
-    can be changed by setting `encode=False` if no encoding is necessary.
-
     Embedding Encoder returns (unique_values + 1) / 2 vectors per categorical variable, with a minimum of 2
     and a maximum of 50. However, this can be changed by passing a list of integers to :attr:`dimensions`.
 
@@ -47,11 +45,6 @@ class EmbeddingEncoder(BaseEstimator, TransformerMixin):
     task :
         "regression" or "classification". This determines the units in the head layer, loss and
         metrics used.
-    encode :
-        Whether to apply `OrdinalEncoder` to categorical variables, by default True.
-    unknown_category :
-        Out of vocabulary values will be mapped to this category. This should match the unknown
-        value used in OrdinalEncoder.
     numeric_vars :
         Array-like of strings containing the names of the numeric variables that will be included
         as inputs to the network.
@@ -86,7 +79,7 @@ class EmbeddingEncoder(BaseEstimator, TransformerMixin):
     keep_model :
         Whether to assign the Tensorflow model to :attr:`_model`. Setting to True will prevent the
         EmbeddingEncoder from being pickled. Default False. Please note that the model's `history`
-        dict is available at :attr:`history`.
+        dict is available at :attr:`_history`.
 
     Attributes
     ----------
@@ -112,8 +105,6 @@ class EmbeddingEncoder(BaseEstimator, TransformerMixin):
     def __init__(
         self,
         task: str,
-        encode: bool = True,
-        unknown_category: int = 999,
         numeric_vars: Optional[List[str]] = None,
         dimensions: Optional[List[int]] = None,
         layers_units: Optional[List[int]] = None,
@@ -137,8 +128,6 @@ def __init__(
             raise ValueError(
                 "classif_classes and classif_loss must be None for regression"
             )
-        self.encode = encode
-        self.unknown_category = unknown_category
         self.numeric_vars = numeric_vars
         self.dimensions = dimensions
         self.layers_units = layers_units
@@ -229,13 +218,10 @@ def fit(
         self._categorical_vars = list(X_copy.columns)
         self._fit_dtypes = X_copy.dtypes
 
-        if self.encode:
-            self._ordinal_encoder = OrdinalEncoder(
-                handle_unknown="use_encoded_value", unknown_value=self.unknown_category
-            )
-            X_copy[self._categorical_vars] = self._ordinal_encoder.fit_transform(
-                X_copy[self._categorical_vars]
-            )
+        self._ordinal_encoder = OrdinalEncoderStart1()
+        X_copy[self._categorical_vars] = self._ordinal_encoder.fit_transform(
+            X_copy[self._categorical_vars]
+        )
 
         if self.pretrained:
             self._embeddings_mapping = self.mapping_from_json()
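The new embedding_encoder/custom_ordinal.py is one of the 5 changed files, but its diff is not shown in this excerpt, so the exact implementation of OrdinalEncoderStart1 is not visible here. Judging from the commit title and from 0 being used as the out-of-vocabulary index further down, a minimal sketch might look like the following. This is an assumption, not the actual file:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder


class OrdinalEncoderStart1(BaseEstimator, TransformerMixin):
    """Sketch: codes for known categories start at 1, unseen values map to 0."""

    def fit(self, X, y=None):
        # Encode unknowns as -1, then shift everything by +1 so 0 is reserved for them.
        self._encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
        self._encoder.fit(X)
        return self

    def transform(self, X):
        return self._encoder.transform(X) + 1

    def inverse_transform(self, X):
        return self._encoder.inverse_transform(X - 1)

Any encoder with this shifted behavior would keep the fit/fit_transform/transform/inverse_transform interface that the diff relies on, while guaranteeing every known category a positive integer code.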
@@ -356,7 +342,7 @@ def fit(
         self._embeddings_mapping = {
             k: pd.DataFrame(
                 self._weights[k][0].numpy(),
-                index=np.sort(np.append(X_copy[k].unique(), self.unknown_category)),
+                index=np.sort(np.append(X_copy[k].unique(), 0)),
                 columns=[
                     f"embedding_{k}_{i}"
                     for i in range(self._weights[k][0].shape[1])
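Because the ordinal codes now start at 1, appending 0 to X_copy[k].unique() produces an index that covers every known code plus the out-of-vocabulary slot. A small illustration with made-up numbers (column name and weights are hypothetical):

import numpy as np
import pandas as pd

encoded = pd.Series([1.0, 2.0, 3.0, 2.0], name="city")  # codes produced by the ordinal encoder
index = np.sort(np.append(encoded.unique(), 0))          # array([0., 1., 2., 3.])
weights = np.random.rand(4, 2)                           # stand-in for the learned embedding matrix
mapping = pd.DataFrame(
    weights, index=index, columns=["embedding_city_0", "embedding_city_1"]
)
# Row 0 holds the vector that unknown categories will pick up at transform time.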
@@ -418,10 +404,9 @@ def transform(self, X: pd.DataFrame) -> Union[pd.DataFrame, np.ndarray]:
         if not all(i in X_copy.columns for i in self._categorical_vars):
             raise ValueError("X must contain all categorical variables.")
 
-        if self.encode:
-            X_copy[self._categorical_vars] = self._ordinal_encoder.transform(
-                X_copy[self._categorical_vars]
-            )
+        X_copy[self._categorical_vars] = self._ordinal_encoder.transform(
+            X_copy[self._categorical_vars]
+        )
         final_embeddings = []
         for k in self._categorical_vars:
             final_embedding = X_copy.join(self._embeddings_mapping[k], on=k, how="left")
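The left join attaches an embedding vector to every row by looking up its ordinal code in the mapping; with 0 reserved for unknowns and present in the mapping index, out-of-vocabulary rows resolve to row 0 rather than producing missing values. Continuing with toy numbers (column name and values assumed):

import pandas as pd

mapping = pd.DataFrame(
    [[0.0, 0.0], [0.1, 0.9], [0.4, 0.6]],
    index=[0.0, 1.0, 2.0],
    columns=["embedding_city_0", "embedding_city_1"],
)
X_copy = pd.DataFrame({"city": [2.0, 1.0, 0.0]})  # 0.0 = a category unseen during fit
joined = X_copy.join(mapping, on="city", how="left")
# Every row, including the unknown one, now carries its two embedding columns.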
@@ -452,7 +437,9 @@ def inverse_transform(self, X: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
             X = np.array(X)
         X_copy = X.copy()
         if not isinstance(X_copy, pd.DataFrame):
-            X_copy = pd.DataFrame(X_copy, columns=[f"cat{i}" for i in range(X_copy.shape[1])])
+            X_copy = pd.DataFrame(
+                X_copy, columns=[f"cat{i}" for i in range(X_copy.shape[1])]
+            )
 
         inverted_dfs = []
         for k in self._categorical_vars:
@@ -465,13 +452,9 @@ def inverse_transform(self, X: Union[pd.DataFrame, np.ndarray]) -> pd.DataFrame:
             inverted_dfs.append(inverted)
         output = pd.concat(inverted_dfs, axis=1)
 
-        if self.encode:
-            original = self._ordinal_encoder.inverse_transform(output)
-            original = pd.DataFrame(
-                original, columns=output.columns, index=X_copy.index
-            )
-        else:
-            original = output
+        original = self._ordinal_encoder.inverse_transform(output)
+        original = pd.DataFrame(original, columns=output.columns, index=X_copy.index)
+
         original = original.astype(dict(zip(original.columns, self._fit_dtypes)))
         if isinstance(X, np.ndarray):
             original = original.values