demo_eda/lib/EncoderManager.py
2025-07-12 01:17:12 +02:00

233 lines
7.8 KiB
Python

import warnings
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
class EncoderManager:
    """
    Manage encoding and decoding of a pandas DataFrame with scikit-learn encoders.

    A private copy of the input DataFrame is kept in ``self.df`` so the caller's
    frame is never modified. Columns are encoded with ``LabelEncoder``,
    ``OrdinalEncoder`` or ``OneHotEncoder`` depending on the tuple they are
    listed in. One-hot columns are replaced by one ``"<col>_<category>"`` column
    per category, appended at the end of the frame; decoding re-creates the
    original column (also appended at the end) and removes those dummy columns.

    Parameters
    ----------
    df_ : pd.DataFrame
        The DataFrame to be encoded and decoded (copied on construction).
    label_cols : tuple
        The columns to encode with LabelEncoder.
    ordinal_cols : tuple
        The columns to encode with OrdinalEncoder.
    onehot_cols : tuple
        The columns to encode with OneHotEncoder.
    """

    def __init__(self, df_: pd.DataFrame, label_cols: tuple, ordinal_cols: tuple, onehot_cols: tuple):
        """
        Store a copy of the DataFrame and create one unfitted encoder per column.

        Parameters
        ----------
        df_ : pd.DataFrame
            The DataFrame to be encoded and decoded.
        label_cols : tuple
            The columns to encode with LabelEncoder.
        ordinal_cols : tuple
            The columns to encode with OrdinalEncoder.
        onehot_cols : tuple
            The columns to encode with OneHotEncoder.
        """
        self.df = df_.copy()
        self.label_cols = label_cols
        self.ordinal_cols = ordinal_cols
        self.onehot_cols = onehot_cols
        self.encoders = {}
        self._is_encoded = False
        for col_ in self.label_cols:
            self.encoders[col_] = LabelEncoder()
        for col_ in self.ordinal_cols:
            self.encoders[col_] = OrdinalEncoder()
        for col_ in self.onehot_cols:
            self.encoders[col_] = self._make_onehot_encoder()

    @staticmethod
    def _make_onehot_encoder():
        """Build a dense-output OneHotEncoder that works across scikit-learn versions."""
        # scikit-learn 1.2 renamed `sparse` to `sparse_output` and later removed the
        # old keyword; try the new name first and fall back for older releases.
        try:
            return OneHotEncoder(sparse_output=False)
        except TypeError:
            return OneHotEncoder(sparse=False)

    def encode(self, inplace: bool = False) -> "pd.DataFrame | EncoderManager":
        """
        Fit the encoders and encode the configured columns.

        Nothing is encoded when the stored DataFrame is already flagged as encoded.

        Parameters
        ----------
        inplace : bool
            If True, the stored DataFrame is modified and flagged as encoded.
            If False (default), the work is done on a copy and the stored
            DataFrame is left untouched.

        Returns
        -------
        pd.DataFrame or EncoderManager
            The encoded copy when ``inplace`` is False, otherwise ``self``
            (which allows call chaining).
        """
        df_ = self.df if inplace else self.df.copy()
        # prevent multiple encoding
        if not self._is_encoded:
            # Silence warnings only for the duration of the encoding; the context
            # manager restores the caller's warning filters afterwards.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                for col_ in self.label_cols:
                    df_[col_] = self.encoders[col_].fit_transform(df_[col_])
                for col_ in self.ordinal_cols:
                    # The encoder returns an (n, 1) result; flatten it to one column.
                    df_[col_] = np.asarray(self.encoders[col_].fit_transform(df_[[col_]])).ravel()
                for col_ in self.onehot_cols:
                    encoded = np.asarray(self.encoders[col_].fit_transform(df_[[col_]]))
                    categories = self.encoders[col_].categories_[0]
                    for i_, category in enumerate(categories):
                        df_[f"{col_}_{category}"] = encoded[:, i_]
                    df_.drop(col_, axis=1, inplace=True)
        if inplace:
            self._is_encoded = True
            return self
        return df_

    def decode(self, inplace: bool = False) -> "pd.DataFrame | EncoderManager":
        """
        Decode the configured columns back to their original values.

        Nothing is decoded unless the stored DataFrame is flagged as encoded.

        Parameters
        ----------
        inplace : bool
            If True, the stored DataFrame is modified and flagged as decoded.
            If False (default), the work is done on a copy and the stored
            DataFrame is left untouched.

        Returns
        -------
        pd.DataFrame or EncoderManager
            The decoded copy when ``inplace`` is False, otherwise ``self``
            (which allows call chaining).
        """
        df_ = self.df if inplace else self.df.copy()
        # prevent multiple decoding
        if self._is_encoded:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                for col_ in self.onehot_cols:
                    categories = np.asarray(self.encoders[col_].categories_[0])
                    # Use the exact names produced by encode(); a substring match
                    # could pick up (and later drop) unrelated columns.
                    dummy_cols = [f"{col_}_{category}" for category in categories]
                    # For each row, the dummy column holding the largest value
                    # identifies the original category.
                    winners = df_[dummy_cols].to_numpy().argmax(axis=1)
                    df_[col_] = categories[winners]
                    df_.drop(columns=dummy_cols, inplace=True)
                for col_ in self.ordinal_cols:
                    decoded = self.encoders[col_].inverse_transform(df_[[col_]].astype(int))
                    df_[col_] = np.asarray(decoded).ravel()
                for col_ in self.label_cols:
                    df_[col_] = self.encoders[col_].inverse_transform(df_[col_].astype(int))
        if inplace:
            self._is_encoded = False
            return self
        return df_

    def get_df(self, as_copy: bool = False, last_encoded: bool = False):
        """
        Get the stored DataFrame, optionally with its encoded flag.

        Parameters
        ----------
        as_copy : bool
            If True, return a copy of the stored DataFrame instead of the
            stored object itself.
        last_encoded : bool
            If True, also return the flag telling whether the stored DataFrame
            is currently encoded.

        Returns
        -------
        pd.DataFrame or Tuple[pd.DataFrame, bool]
            The DataFrame alone when ``last_encoded`` is False, otherwise a
            ``(DataFrame, is_encoded)`` tuple.
        """
        frame = self.df.copy() if as_copy else self.df
        if last_encoded:
            return frame, self._is_encoded
        return frame

    def encoded_cols(self):
        """
        Get the column collections used for encoding.

        Returns
        -------
        tuple
            ``(onehot_cols, ordinal_cols, label_cols)``, each exactly as it was
            passed to the constructor.
        """
        return self.onehot_cols, self.ordinal_cols, self.label_cols

    def normalize(self, method="min-max", inplace=False, columns=None):
        """
        Normalize columns of the stored DataFrame.

        Parameters
        ----------
        method : str
            ``"min-max"`` (MinMaxScaler, default) or ``"z-score"`` (StandardScaler).
        inplace : bool
            If True, the stored DataFrame is modified; otherwise a normalized
            copy is returned and the stored DataFrame is left untouched.
        columns : list, optional
            Columns to normalize. If None (default), every column selected by
            ``select_dtypes(include=["float", "int"])`` is normalized.

        Returns
        -------
        pd.DataFrame or EncoderManager
            The normalized copy when ``inplace`` is False, otherwise ``self``.

        Raises
        ------
        ValueError
            If ``method`` is not one of the supported names.
        """
        if method == "min-max":
            scaler = MinMaxScaler()
        elif method == "z-score":
            scaler = StandardScaler()
        else:
            raise ValueError("Invalid normalization method specified.")

        if columns is None:
            target_cols = list(self.df.select_dtypes(include=["float", "int"]).columns)
        else:
            target_cols = list(columns)

        df_ = self.df if inplace else self.df.copy()
        # With nothing to scale there is nothing to fit; leave the frame as is.
        if target_cols:
            # Index by column labels so the scaled values replace exactly the
            # columns that were fed to the scaler.
            df_[target_cols] = scaler.fit_transform(df_[target_cols])
        return self if inplace else df_

    def get_encoders(self):
        """
        Get the encoders used for encoding.

        Returns
        -------
        Dict[str, object]
            A dictionary mapping each configured column name to its encoder object.
        """
        return self.encoders