import warnings

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler


class EncoderManager:
    """
    Manage the encoding and decoding of a pandas DataFrame using LabelEncoder,
    OrdinalEncoder, and OneHotEncoder encoders.

    The instance works on its own copy of the DataFrame it is given, so the
    caller's DataFrame is never modified. Encoding and decoding can either
    update that internal copy (``inplace=True``) or return a new DataFrame
    (``inplace=False``). Accessors expose the managed DataFrame, the configured
    columns, and the encoder objects.

    Parameters:
    -----------
    df_: pd.DataFrame
        The DataFrame to be encoded and decoded.
    label_cols: tuple
        The columns to encode with LabelEncoder.
    ordinal_cols: tuple
        The columns to encode with OrdinalEncoder.
    onehot_cols: tuple
        The columns to encode with OneHotEncoder.
    """

    def __init__(self, df_: pd.DataFrame, label_cols: tuple, ordinal_cols: tuple, onehot_cols: tuple):
        """
        Initialize the EncoderManager instance.

        Parameters:
        -----------
        df_: pd.DataFrame
            The DataFrame to be encoded and decoded. It is copied.
        label_cols: tuple
            The columns to encode with LabelEncoder.
        ordinal_cols: tuple
            The columns to encode with OrdinalEncoder.
        onehot_cols: tuple
            The columns to encode with OneHotEncoder.
        """
        self.df = df_.copy()
        self.label_cols = label_cols
        self.ordinal_cols = ordinal_cols
        self.onehot_cols = onehot_cols
        self.encoders = {}
        # Tracks whether self.df currently holds encoded values; only in-place
        # operations flip this flag.
        self._is_encoded = False

        for col_ in self.label_cols:
            self.encoders[col_] = LabelEncoder()
        for col_ in self.ordinal_cols:
            self.encoders[col_] = OrdinalEncoder()
        for col_ in self.onehot_cols:
            self.encoders[col_] = self._make_onehot_encoder()

    @staticmethod
    def _make_onehot_encoder():
        """Build a dense-output OneHotEncoder across scikit-learn versions."""
        try:
            # scikit-learn >= 1.2 spells the option `sparse_output`.
            return OneHotEncoder(sparse_output=False)
        except TypeError:
            # Older releases only know the original `sparse` keyword.
            return OneHotEncoder(sparse=False)

    def _onehot_columns(self, col_):
        """Return the generated column names for a fitted one-hot column."""
        categories = self.encoders[col_].categories_[0]
        return [f"{col_}_{category}" for category in categories]

    def encode(self, inplace: bool = False):
        """
        Encode the configured columns with their assigned encoders.

        Nothing is re-encoded if the managed DataFrame is already encoded.
        One-hot columns are replaced by one ``"<column>_<category>"`` column
        per category. Warnings raised while encoding are suppressed for the
        duration of the call only.

        Parameters:
        -----------
        inplace: bool
            If True, the managed DataFrame is modified and marked as encoded.
            If False, a copy is encoded and returned. Default is False.

        Returns:
        --------
        EncoderManager or pd.DataFrame
            ``self`` when ``inplace`` is True (allows chaining), otherwise the
            encoded copy of the DataFrame.
        """
        df_ = self.df if inplace else self.df.copy()

        # Scoped suppression: the caller's warning filters are restored on
        # exit instead of being overwritten globally.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            # prevent multiple encoding
            if not self._is_encoded:
                for col_ in self.label_cols:
                    df_[col_] = self.encoders[col_].fit_transform(df_[col_])

                for col_ in self.ordinal_cols:
                    # OrdinalEncoder works on 2-D input and returns (n, 1).
                    encoded = self.encoders[col_].fit_transform(df_[[col_]])
                    df_[col_] = np.asarray(encoded).ravel()

                for col_ in self.onehot_cols:
                    encoded = self.encoders[col_].fit_transform(df_[[col_]])
                    for i_, name in enumerate(self._onehot_columns(col_)):
                        df_[name] = encoded[:, i_]
                    df_.drop(col_, axis=1, inplace=True)

        if inplace:
            self._is_encoded = True
            return self
        return df_

    def decode(self, inplace: bool = False):
        """
        Decode the configured columns back to their original values.

        Nothing is decoded unless the managed DataFrame is currently encoded.
        Warnings raised while decoding are suppressed for the duration of the
        call only.

        Parameters:
        -----------
        inplace: bool
            If True, the managed DataFrame is modified and marked as decoded.
            If False, a copy is decoded and returned. Defaults to False.

        Returns:
        --------
        EncoderManager or pd.DataFrame
            ``self`` when ``inplace`` is True (allows chaining), otherwise the
            decoded copy of the DataFrame.
        """
        df_ = self.df if inplace else self.df.copy()

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            # prevent multiple decoding
            if self._is_encoded:
                for col_ in self.onehot_cols:
                    categories = self.encoders[col_].categories_[0]
                    dummy_cols = self._onehot_columns(col_)
                    # Pick, per row, the category whose indicator column is
                    # largest; this restores the original category values.
                    winners = df_[dummy_cols].to_numpy().argmax(axis=1)
                    df_[col_] = categories[winners]
                    # Drop exactly the generated columns, not every column
                    # that happens to contain the prefix.
                    df_.drop(columns=dummy_cols, inplace=True)

                for col_ in self.ordinal_cols:
                    decoded = self.encoders[col_].inverse_transform(df_[[col_]].astype(int))
                    df_[col_] = np.asarray(decoded).ravel()

                for col_ in self.label_cols:
                    df_[col_] = self.encoders[col_].inverse_transform(df_[col_].astype(int))

        if inplace:
            self._is_encoded = False
            return self
        return df_

    def get_df(self, as_copy: bool = False, last_encoded: bool = False):
        """
        Get the managed DataFrame in its current (encoded or decoded) state.

        Parameters:
        -----------
        as_copy: bool
            Whether to return a copy of the DataFrame instead of the managed
            object itself.
        last_encoded: bool
            Whether to also return the flag telling if the managed DataFrame
            is currently encoded.

        Returns:
        --------
        pd.DataFrame or Tuple[pd.DataFrame, bool]
            Only the DataFrame when ``last_encoded`` is False; otherwise a
            tuple of the DataFrame and the encoded flag.
        """
        frame = self.df.copy() if as_copy else self.df
        if last_encoded:
            return frame, self._is_encoded
        return frame

    def encoded_cols(self):
        """
        Get the column collections configured for each encoder type.

        Returns:
        --------
        Tuple[tuple, tuple, tuple]
            The column collections as passed to the constructor, in the
            order: onehot_cols, ordinal_cols, label_cols.
        """
        return self.onehot_cols, self.ordinal_cols, self.label_cols

    def normalize(self, method="min-max", inplace=False):
        """
        Normalize the float and int columns of the managed DataFrame.

        Two methods are supported: "min-max" (MinMaxScaler) and "z-score"
        (StandardScaler). If there is nothing numeric to scale (no float/int
        columns, or no rows), the data is left unchanged.

        Parameters:
        -----------
        method: str
            Normalization method to use, default "min-max".
        inplace: bool
            If True, the managed DataFrame is modified. If False, a
            normalized copy is returned. Default is False.

        Returns:
        --------
        EncoderManager or pd.DataFrame
            ``self`` when ``inplace`` is True (allows chaining), otherwise the
            normalized copy of the DataFrame.

        Raises:
        -------
        ValueError
            If ``method`` is not "min-max" or "z-score".
        """
        if method == "min-max":
            scaler = MinMaxScaler()
        elif method == "z-score":
            scaler = StandardScaler()
        else:
            raise ValueError("Invalid normalization method specified.")

        target = self.df if inplace else self.df.copy()
        df_numeric = target.select_dtypes(include=["float", "int"])

        # The scalers reject empty input, so only scale when there is data.
        if not df_numeric.empty:
            target[df_numeric.columns] = scaler.fit_transform(df_numeric)

        return self if inplace else target

    def get_encoders(self):
        """
        Get the encoders used for encoding.

        Returns:
        --------
        Dict[str, object]
            A dictionary mapping each configured column name to its encoder
            object.
        """
        return self.encoders