cpa-analytics
diff --git a/‎README.md‎
Lines changed: 18 additions & 2 deletions b/‎README.md‎
Lines changed: 18 additions & 2 deletions
diff --git a/‎docs/source/api.rst‎
Lines changed: 1 addition & 1 deletion b/‎docs/source/api.rst‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎embedding_encoder/core.py‎
Lines changed: 59 additions & 1 deletion b/‎embedding_encoder/core.py‎
Lines changed: 59 additions & 1 deletion
diff --git a/‎embedding_encoder/examples/titanic.ipynb‎
Lines changed: 51 additions & 11 deletions b/‎embedding_encoder/examples/titanic.ipynb‎
Lines changed: 51 additions & 11 deletions
diff --git a/‎embedding_encoder/utils/__init__.py‎
Lines changed: 5 additions & 0 deletions b/‎embedding_encoder/utils/__init__.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎embedding_encoder/compose.py‎ ‎embedding_encoder/utils/compose.py‎embedding_encoder/compose.py renamed to embedding_encoder/utils/compose.py b/‎embedding_encoder/compose.py‎ ‎embedding_encoder/utils/compose.py‎embedding_encoder/compose.py renamed to embedding_encoder/utils/compose.py
diff --git a/‎embedding_encoder/custom_ordinal.py‎ ‎embedding_encoder/utils/custom_ordinal.py‎embedding_encoder/custom_ordinal.py renamed to embedding_encoder/utils/custom_ordinal.py b/‎embedding_encoder/custom_ordinal.py‎ ‎embedding_encoder/utils/custom_ordinal.py‎embedding_encoder/custom_ordinal.py renamed to embedding_encoder/utils/custom_ordinal.py
diff --git a/‎embedding_encoder/utils/plot.py‎
Lines changed: 63 additions & 0 deletions b/‎embedding_encoder/utils/plot.py‎
Lines changed: 63 additions & 0 deletions
diff --git a/‎requirements-dev.txt‎
Lines changed: 10 additions & 7 deletions b/‎requirements-dev.txt‎
Lines changed: 10 additions & 7 deletions
diff --git a/‎requirements.in‎
Lines changed: 4 additions & 0 deletions b/‎requirements.in‎
Lines changed: 4 additions & 0 deletions
@@ -2,7 +2,9 @@
 
 ## Overview
 
-Embedding Encoder is a scikit-learn-compliant transformer that converts categorical variables to numeric vector representations. This is achieved by creating a small multilayer perceptron architecture in which each categorical variable is passed through an embedding layer, for which weights are extracted and turned into DataFrame columns.
+Embedding Encoder is a scikit-learn-compliant transformer that converts categorical variables into numeric vector representations. This is achieved by creating a small multilayer perceptron architecture in which each categorical variable is passed through an embedding layer, for which weights are extracted and turned into DataFrame columns.
+
+While the idea is not new (it was popularized after [the team that placed 3rd in the Rossmann Kaggle competition used it](https://www.kaggle.com/c/rossmann-store-sales/discussion/17974)), and although Python implementations have surfaced over the years, we are not aware of any library that integrates this functionality into scikit-learn.
 
 ## Installation and dependencies
 
@@ -88,7 +90,7 @@ from sklearn.preprocessing import StandardScaler
 from sklearn.impute import SimpleImputer
 
 from embedding_encoder import EmbeddingEncoder
-from embedding_encoder.compose import ColumnTransformerWithNames
+from embedding_encoder.utils import ColumnTransformerWithNames
 
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
 
@@ -106,6 +108,20 @@ pipe.fit(X_train, y_train)
 
 Like scikit-learn transformers, Embedding Encoder also has an `inverse_transform` method that recomposes the original input.
 
+## Plotting embeddings
+
+The idea behind embeddings is that categories that are conceptually similar should have similar vector representations. For example, "December" and "January" should be close to each other when the target variable is ice cream sales.
+
+This can be analyzed with the `plot_embeddings` function.
+
+```python
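+from embedding_encoder import EmbeddingEncoder
+from embedding_encoder.utils import plot_embeddings
+
+ee = EmbeddingEncoder(task="classification")
+ee.fit(X=X, y=y)
+# Replace "month" with the name of one of the categorical variables the encoder was fitted on.
+plot_embeddings(ee, variable="month", model="pca")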
+```
+
 ## Advanced usage
 
 Embedding Encoder gives some control over the neural network. In particular, its constructor allows setting how deep and large the network should be (by modifying `layers_units`), as well as the dropout rate between dense layers. Epochs and batch size can also be modified.
 
@@ -15,7 +15,7 @@ EmbeddingEncoder class
 
 Utilities
 ---------------
-.. automodule:: embedding_encoder.compose
+.. automodule:: embedding_encoder.utils.compose
    :members:
    :undoc-members:
    :show-inheritance:
@@ -7,7 +7,7 @@
 import numpy as np
 from sklearn.base import BaseEstimator, TransformerMixin
 
-from embedding_encoder.custom_ordinal import OrdinalEncoderStart1
+from embedding_encoder.utils import OrdinalEncoderStart1
 
 
 class EmbeddingEncoder(BaseEstimator, TransformerMixin):
@@ -465,3 +465,61 @@ def get_feature_names_out(self, input_features=None):
 
     def get_feature_names(self, input_features=None):
         return self._columns_out
+
+    def plot_embeddings(self, variable: str, model: str = "pca"):
+        """Plot embeddings for a variable by passing a fitted EmbeddingEncoder and reducing to 2D.
+
+        Parameters
+        ----------
+        variable :
+            Variable to plot. Please note that scikit-learn's Pipeline might strip column names.
+        model : str, optional
+            Dimensionality reduction model. Either "tsne" or "pca". Default "pca".
+
+        Returns
+        -------
+        matplotlib.axes._subplots.AxesSubplot
+            Seaborn scatterplot (Matplotlib axes)
+
+        Raises
+        ------
+        ValueError
+            If selected variable has less than 3 unique values.
+        ValueError
+            If selected model is not "tsne" or "pca".
+        ImportError
+            If seaborn is not installed.
+        """
+        if self._embeddings_mapping[variable].shape[0] < 3:
+            raise ValueError("Nothing to plot when variable has less than 3 unique values.")
+        dimensions = 2
+        if model not in ["tsne", "pca"]:
+            raise ValueError("model must be either 'tsne' or 'pca'.")
+        try:
+            import seaborn as sns
+            sns.set(rc={"figure.figsize": (8, 6), "figure.dpi": 100})
+        except ImportError:
+            raise ImportError("Plotting requires seaborn.")
+        if model == "tsne":
+            from sklearn.manifold import TSNE
+
+            model = TSNE(init="pca", n_components=dimensions, learning_rate="auto")
+        else:
+            from sklearn.decomposition import PCA
+
+            model = PCA(n_components=dimensions)
+
+        embeddings = self._embeddings_mapping[variable]
+        variable_position = self._categorical_vars.index(variable)
+        original_classes = self._ordinal_encoder.categories_[variable_position]
+        original_index = ["OOV"] + list(original_classes)
+
+        reduced = model.fit_transform(embeddings)
+        reduced = pd.DataFrame(
+            reduced,
+            index=original_index,
+            columns=[f"Component {i}" for i in range(dimensions)],
+        ).rename_axis("Classes").reset_index()
+        plot = sns.scatterplot(data=reduced, x="Component 0", y="Component 1", hue="Classes", s=100)
+        plot.set_title(f"{model.__class__.__name__} embeddings projection for variable '{variable}'")
+        return plot
@@ -0,0 +1,5 @@
+from embedding_encoder.utils.plot import plot_embeddings
+from embedding_encoder.utils.compose import ColumnTransformerWithNames
+from embedding_encoder.utils.custom_ordinal import OrdinalEncoderStart1
+
+__all__ = ["plot_embeddings", "ColumnTransformerWithNames", "OrdinalEncoderStart1"]
@@ -0,0 +1,63 @@
+import pandas as pd
+
+
+def plot_embeddings(embedding_encoder, variable: str, model: str = "pca"):
+    """Plot embeddings for a variable by passing a fitted EmbeddingEncoder and reducing to 2D.
+
+    Parameters
+    ----------
+    embedding_encoder : EmbeddingEncoder
+        Fitted transformer.
+    variable :
+        Variable to plot. Please note that scikit-learn's Pipeline might strip column names.
+    model : str, optional
+        Dimensionality reduction model. Either "tsne" or "pca". Default "pca".
+
+    Returns
+    -------
+    matplotlib.axes._subplots.AxesSubplot
+        Seaborn scatterplot (Matplotlib axes)
+
+    Raises
+    ------
+    ValueError
+        If selected variable has less than 3 unique values.
+    ValueError
+        If selected model is not "tsne" or "pca".
+    ImportError
+        If seaborn is not installed.
+    """
+    if embedding_encoder._embeddings_mapping[variable].shape[0] < 3:
+        raise ValueError("Nothing to plot when variable has less than 3 unique values.")
+    dimensions = 2
+    if model not in ["tsne", "pca"]:
+        raise ValueError("model must be either 'tsne' or 'pca'.")
+    try:
+        import seaborn as sns
+        sns.set(rc={"figure.figsize": (8, 6), "figure.dpi": 100})
+        sns.set_palette("viridis")
+    except ImportError:
+        raise ImportError("Plotting requires seaborn.")
+    if model == "tsne":
+        from sklearn.manifold import TSNE
+
+        model = TSNE(init="pca", n_components=dimensions, learning_rate="auto")
+    else:
+        from sklearn.decomposition import PCA
+
+        model = PCA(n_components=dimensions)
+
+    embeddings = embedding_encoder._embeddings_mapping[variable]
+    variable_position = embedding_encoder._categorical_vars.index(variable)
+    original_classes = embedding_encoder._ordinal_encoder.categories_[variable_position]
+    original_index = ["OOV"] + list(original_classes)
+
+    reduced = model.fit_transform(embeddings)
+    reduced = pd.DataFrame(
+        reduced,
+        index=original_index,
+        columns=[f"Component {i}" for i in range(dimensions)],
+    ).rename_axis("Classes").reset_index()
+    plot = sns.scatterplot(data=reduced, x="Component 0", y="Component 1", hue="Classes", s=100)
+    plot.set_title(f"{model.__class__.__name__} embeddings projection for variable '{variable}'")
+    return plot
@@ -28,7 +28,7 @@ certifi==2021.10.8
     # via
     #   -c requirements.txt
     #   requests
-charset-normalizer==2.0.10
+charset-normalizer==2.0.12
     # via
     #   -c requirements.txt
     #   requests
@@ -78,7 +78,7 @@ idna==3.3
     #   requests
 imagesize==1.3.0
     # via sphinx
-importlib-metadata==4.10.1
+importlib-metadata==4.11.1
     # via
     #   -c requirements.txt
     #   click
@@ -110,6 +110,7 @@ natsort==8.0.2
     # via domdf-python-tools
 packaging==21.3
     # via
+    #   -c requirements.txt
     #   deprecation-alias
     #   pytest
     #   sphinx
@@ -126,8 +127,10 @@ pygments==2.10.0
     #   sphinx
     #   sphinx-prompt
     #   sphinx-tabs
-pyparsing==3.0.6
-    # via packaging
+pyparsing==3.0.7
+    # via
+    #   -c requirements.txt
+    #   packaging
 pytest==6.2.5
     # via -r requirements-dev.in
 pytz==2021.3
@@ -142,9 +145,9 @@ requests==2.27.1
     #   apeye
     #   cachecontrol
     #   sphinx
-ruamel.yaml==0.17.20
+ruamel-yaml==0.17.21
     # via sphinx-toolbox
-ruamel.yaml.clib==0.2.6
+ruamel-yaml-clib==0.2.6
     # via ruamel.yaml
 six==1.16.0
     # via
@@ -201,7 +204,7 @@ tornado==6.1
     # via livereload
 typed-ast==1.5.1
     # via black
-typing-extensions==4.0.1
+typing-extensions==4.1.1
     # via
     #   -c requirements.txt
     #   black
 
@@ -0,0 +1,4 @@
+scikit-learn
+pandas
+tensorflow>=2.8.0
+seaborn
-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +scikit-learn
 +pandas
 +tensorflow>=2.8.0
 +seaborn