# demo_eda/lib/EncoderManager.py

import warnings

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler


class EncoderManager:
    """
    Manage reversible encoding of selected columns of a pandas DataFrame.

    A private copy of the supplied DataFrame is stored on the instance. Each configured column gets its own
    scikit-learn encoder (LabelEncoder, OrdinalEncoder or OneHotEncoder). The stored DataFrame can be
    encoded/decoded in place, or an encoded/decoded copy can be returned while the stored one is left untouched.
    Accessors expose the stored DataFrame, the configured column groups and the encoder objects.

    Parameters:
    -----------
    df_: pd.DataFrame
        The DataFrame to be encoded and decoded. It is copied; the caller's object is never modified.
    label_cols: tuple
        The columns to encode with LabelEncoder.
    ordinal_cols: tuple
        The columns to encode with OrdinalEncoder.
    onehot_cols: tuple
        The columns to encode with OneHotEncoder.
    """

    def __init__(self, df_: pd.DataFrame, label_cols: tuple, ordinal_cols: tuple, onehot_cols: tuple):
        """
        Initialize the EncoderManager instance.

        Parameters:
        -----------
        df_: pd.DataFrame
            The DataFrame to be encoded and decoded.
        label_cols: tuple
            The columns to encode with LabelEncoder.
        ordinal_cols: tuple
            The columns to encode with OrdinalEncoder.
        onehot_cols: tuple
            The columns to encode with OneHotEncoder.
        """
        self.df = df_.copy()
        self.label_cols = label_cols
        self.ordinal_cols = ordinal_cols
        self.onehot_cols = onehot_cols
        self.encoders = {}
        # Tracks whether self.df currently holds encoded values (only in-place calls flip it).
        self._is_encoded = False

        for col_ in self.label_cols:
            self.encoders[col_] = LabelEncoder()

        for col_ in self.ordinal_cols:
            self.encoders[col_] = OrdinalEncoder()

        for col_ in self.onehot_cols:
            self.encoders[col_] = self._make_onehot_encoder()

    @staticmethod
    def _make_onehot_encoder():
        """Build a dense-output OneHotEncoder across scikit-learn versions."""
        try:
            # scikit-learn >= 1.2 spells the option `sparse_output`.
            return OneHotEncoder(sparse_output=False)
        except TypeError:
            # Older releases only know the original `sparse` keyword.
            return OneHotEncoder(sparse=False)

    def encode(self, inplace: bool = False):
        """
        Encode the configured label, ordinal and one-hot columns.

        Nothing is encoded when the stored DataFrame is already marked as encoded. Each one-hot column is
        replaced by one ``"<column>_<category>"`` column per fitted category.

        Parameters:
        -----------
        inplace: bool
            If True, the stored DataFrame is modified and marked as encoded.
            If False, a copy is encoded and the stored DataFrame is left untouched. Default is False.

        Returns:
        --------
        EncoderManager or pd.DataFrame
            This instance when inplace is True, otherwise the encoded copy.
        """
        df_ = self.df if inplace else self.df.copy()

        # prevent multiple encoding
        if not self._is_encoded:
            # Silence warnings only for the duration of the encoding; the
            # caller's global warning filters are restored on exit.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")

                for col_ in self.label_cols:
                    df_[col_] = self.encoders[col_].fit_transform(df_[col_])

                for col_ in self.ordinal_cols:
                    # The encoder returns an (n, 1) array; flatten it to a plain column.
                    df_[col_] = np.asarray(self.encoders[col_].fit_transform(df_[[col_]])).ravel()

                for col_ in self.onehot_cols:
                    encoded = np.asarray(self.encoders[col_].fit_transform(df_[[col_]]))
                    categories = self.encoders[col_].categories_[0]

                    for i_, category in enumerate(categories):
                        df_[f"{col_}_{category}"] = encoded[:, i_]
                    df_.drop(col_, axis=1, inplace=True)

        if inplace:
            self._is_encoded = True
            return self
        return df_

    def decode(self, inplace: bool = False):
        """
        Decode the configured one-hot, ordinal and label columns back to their original values.

        Nothing is decoded unless the stored DataFrame is marked as encoded. Each restored one-hot column is
        appended as a new column and its ``"<column>_<category>"`` indicator columns are removed.

        Parameters:
        -----------
        inplace: bool
            If True, the stored DataFrame is modified and marked as not encoded.
            If False, a copy is decoded and the stored DataFrame is left untouched. Defaults to False.

        Returns:
        --------
        EncoderManager or pd.DataFrame
            This instance when inplace is True, otherwise the decoded copy.
        """
        df_ = self.df if inplace else self.df.copy()

        # prevent multiple decoding
        if self._is_encoded:
            # Scoped suppression: global warning filters are restored on exit.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")

                for col_ in self.onehot_cols:
                    categories = np.asarray(self.encoders[col_].categories_[0])
                    # Address the indicator columns by their exact names (the same
                    # names encode() generates) so that unrelated columns sharing a
                    # substring are neither read nor dropped.
                    onehot_columns = [f"{col_}_{category}" for category in categories]
                    # Per row, the indicator column holding the largest value wins.
                    winners = df_[onehot_columns].to_numpy().argmax(axis=1)
                    df_[col_] = categories[winners]
                    df_.drop(columns=onehot_columns, inplace=True)

                for col_ in self.ordinal_cols:
                    decoded = self.encoders[col_].inverse_transform(df_[[col_]].astype(int))
                    # inverse_transform returns an (n, 1) array; flatten it to a plain column.
                    df_[col_] = np.asarray(decoded).ravel()

                for col_ in self.label_cols:
                    df_[col_] = self.encoders[col_].inverse_transform(df_[col_].astype(int))

        if inplace:
            self._is_encoded = False
            return self
        return df_

    def get_df(self, as_copy: bool = False, last_encoded: bool = False):
        """
        Get the stored DataFrame, optionally together with its encoded flag.

        Parameters:
        -----------
        as_copy: bool
            Whether to return a copy of the stored DataFrame instead of the stored object itself.
        last_encoded: bool
            Whether to also return the boolean telling if the stored DataFrame is currently encoded.

        Returns:
        --------
        pd.DataFrame or Tuple[pd.DataFrame, bool]
            Only the DataFrame when last_encoded is False; otherwise a tuple of the DataFrame and the
            encoded flag.
        """
        frame = self.df.copy() if as_copy else self.df
        if last_encoded:
            return frame, self._is_encoded
        return frame

    def encoded_cols(self):
        """
        Get the column groups configured for encoding.

        Returns:
        --------
        tuple
            A 3-tuple ``(onehot_cols, ordinal_cols, label_cols)`` holding the collections that were passed
            to the constructor.
        """
        return self.onehot_cols, self.ordinal_cols, self.label_cols

    def normalize(self, method="min-max", inplace=False):
        """
        Normalize the float/int columns of the stored DataFrame.
        This supports two methods: "min-max" (MinMaxScaler) and "z-score" (StandardScaler).

        Parameters:
        method (str): Normalization method to use, default "min-max".
        inplace (bool): If True, the stored DataFrame is modified, otherwise a normalized copy is returned.

        Returns:
        This instance if inplace=True, otherwise the normalized copy (pandas.DataFrame).

        Raises:
        ValueError: If method is neither "min-max" nor "z-score".
        """
        if method == "min-max":
            scaler = MinMaxScaler()
        elif method == "z-score":
            scaler = StandardScaler()
        else:
            raise ValueError("Invalid normalization method specified.")

        df_numeric = self.df.select_dtypes(include=["float", "int"])
        df_normalized = scaler.fit_transform(df_numeric)

        target = self.df if inplace else self.df.copy()
        # Assign by column labels; indexing with the DataFrame itself is treated
        # by pandas as a boolean mask and fails.
        target[df_numeric.columns] = df_normalized
        return self if inplace else target

    def get_encoders(self):
        """
        Get the encoders used for encoding.

        Returns:
        --------
        Dict[str, object]
            A dictionary mapping each configured column name to its encoder object.
        """
        return self.encoders