first commit
This commit is contained in:
commit
7aa9fef838
90 changed files with 31987 additions and 0 deletions
40
lib/EncodeManagerTester.py
Normal file
40
lib/EncodeManagerTester.py
Normal file
|
@ -0,0 +1,40 @@
|
|||
import pandas as pd
|
||||
from EncoderManager import EncoderManager
|
||||
|
||||
|
||||
def main():
    """Run a small end-to-end demonstration of EncoderManager.

    Builds an example DataFrame, encodes it, decodes it again and finally
    min-max normalizes its numeric columns, printing the frame after each
    step.
    """
    # Define example data
    data = {
        'Color': ['Red', 'Blue', 'Green', 'Red', 'Green', 'Blue'],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Male'],
        'Size': ['M', 'S', 'L', 'M', 'XL', 'M'],
        'Species': ['Dog', 'Cat', 'Bird', 'Dog', 'Bird', 'Cat'],
        'Count': [2, 1, 3, 4, 1, 2],
    }
    df = pd.DataFrame(data)

    # Define encoding parameters
    label_cols = ('Gender', 'Species')
    ordinal_cols = ('Size',)
    onehot_cols = ('Color',)

    # Create encoder manager
    encoder = EncoderManager(df, label_cols, ordinal_cols, onehot_cols)

    # Test encoding.
    # Encode in place: the manager only marks its frame as encoded after an
    # in-place encode, and decode() does nothing unless that mark is set.
    # A copy is taken so the printed snapshot is not altered by later steps.
    encoded_df = encoder.encode(inplace=True).get_df(as_copy=True)
    print("Encoded DataFrame:")
    print(encoded_df)

    # Test decoding (in place, so the manager holds the decoded frame again
    # before normalization runs).
    decoded_df = encoder.decode(inplace=True).get_df(as_copy=True)
    print("Decoded DataFrame:")
    print(decoded_df)

    # Test normalization of the numeric columns.
    df_normalized = encoder.normalize('min-max', inplace=True).get_df()
    print("Normalized DataFrame:")
    print(df_normalized)


if __name__ == "__main__":
    main()
|
233
lib/EncoderManager.py
Normal file
233
lib/EncoderManager.py
Normal file
|
@ -0,0 +1,233 @@
|
|||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
|
||||
from sklearn.preprocessing import MinMaxScaler, StandardScaler
|
||||
|
||||
|
||||
class EncoderManager:
    """
    Manage the encoding and decoding of a pandas DataFrame using LabelEncoder,
    OrdinalEncoder and OneHotEncoder.

    The manager works on its own copy of the DataFrame passed to the
    constructor. Encoding and decoding can either modify that stored copy
    (``inplace=True``) or return a new DataFrame (``inplace=False``). The
    stored frame is only considered "encoded" after an in-place encode, and
    only an encoded frame is ever decoded.

    Parameters:
    -----------
    df_: pd.DataFrame
        The DataFrame to be encoded and decoded.
    label_cols: tuple
        The columns to encode with LabelEncoder.
    ordinal_cols: tuple
        The columns to encode with OrdinalEncoder.
    onehot_cols: tuple
        The columns to encode with OneHotEncoder.
    """

    def __init__(self, df_: pd.DataFrame, label_cols: tuple, ordinal_cols: tuple, onehot_cols: tuple):
        """
        Initialize the EncoderManager instance.

        Parameters:
        -----------
        df_: pd.DataFrame
            The DataFrame to be encoded and decoded. It is copied, so the
            caller's frame is never modified.
        label_cols: tuple
            The columns to encode with LabelEncoder.
        ordinal_cols: tuple
            The columns to encode with OrdinalEncoder.
        onehot_cols: tuple
            The columns to encode with OneHotEncoder.
        """
        self.df = df_.copy()
        self.label_cols = label_cols
        self.ordinal_cols = ordinal_cols
        self.onehot_cols = onehot_cols
        self.encoders = {}
        self._is_encoded = False

        # One dedicated encoder per column, keyed by column name.
        for col_ in self.label_cols:
            self.encoders[col_] = LabelEncoder()

        for col_ in self.ordinal_cols:
            self.encoders[col_] = OrdinalEncoder()

        for col_ in self.onehot_cols:
            self.encoders[col_] = self._new_onehot_encoder()

    @staticmethod
    def _new_onehot_encoder():
        """Create a OneHotEncoder that returns dense arrays."""
        # Newer scikit-learn releases renamed ``sparse`` to ``sparse_output``
        # and later removed the old keyword; older releases only know
        # ``sparse``. Try the new spelling first and fall back.
        try:
            return OneHotEncoder(sparse_output=False)
        except TypeError:
            return OneHotEncoder(sparse=False)

    def _onehot_names(self, col_):
        """Return the generated one-hot column names for the fitted column ``col_``."""
        return [f"{col_}_{category}" for category in self.encoders[col_].categories_[0]]

    def encode(self, inplace: bool = False):
        """
        Encode the configured columns with their respective encoders.

        If the stored DataFrame is already marked as encoded, no encoding is
        performed again.

        Parameters:
        -----------
        inplace: bool
            If True, the stored DataFrame is modified and marked as encoded.
            If False, the stored DataFrame is left untouched and an encoded
            copy is returned. Default is False.

        Returns:
        --------
        EncoderManager or pd.DataFrame
            ``self`` when ``inplace`` is True (allows call chaining),
            otherwise the encoded copy of the DataFrame.
        """
        df_ = self.df if inplace else self.df.copy()

        # prevent multiple encoding
        if not self._is_encoded:
            # Silence warnings only for the duration of the encoding; the
            # context manager restores the caller's warning filters on exit.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")

                for col_ in self.label_cols:
                    df_[col_] = self.encoders[col_].fit_transform(df_[col_])

                for col_ in self.ordinal_cols:
                    # The encoder returns an (n, 1) array; flatten it to a column.
                    df_[col_] = self.encoders[col_].fit_transform(df_[[col_]]).ravel()

                for col_ in self.onehot_cols:
                    encoded = self.encoders[col_].fit_transform(df_[[col_]])

                    for i_, name in enumerate(self._onehot_names(col_)):
                        df_[name] = encoded[:, i_]
                    df_.drop(columns=col_, inplace=True)

        if inplace:
            self._is_encoded = True
            return self
        return df_

    def decode(self, inplace: bool = False):
        """
        Decode the configured columns back to their original values.

        Decoding only happens when the stored DataFrame is marked as encoded
        (i.e. after ``encode(inplace=True)``); otherwise the frame is
        returned as it is.

        Parameters:
        -----------
        inplace: bool
            If True, the stored DataFrame is modified and marked as decoded.
            If False, the stored DataFrame is left untouched and a decoded
            copy is returned. Defaults to False.

        Returns:
        --------
        EncoderManager or pd.DataFrame
            ``self`` when ``inplace`` is True (allows call chaining),
            otherwise the decoded copy of the DataFrame.
        """
        df_ = self.df if inplace else self.df.copy()

        # prevent multiple decoding
        if self._is_encoded:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")

                for col_ in self.onehot_cols:
                    categories = np.asarray(self.encoders[col_].categories_[0])
                    # Address the indicator columns by their exact names so
                    # that unrelated columns which merely contain
                    # "<col>_" in their name are neither read nor dropped.
                    names = self._onehot_names(col_)
                    # Per row, the position of the largest indicator value
                    # selects the original category.
                    hot = df_[names].to_numpy().argmax(axis=1)
                    df_[col_] = categories[hot]
                    df_.drop(columns=names, inplace=True)

                for col_ in self.ordinal_cols:
                    decoded = self.encoders[col_].inverse_transform(df_[[col_]].astype(int))
                    # inverse_transform returns an (n, 1) array; flatten it.
                    df_[col_] = np.asarray(decoded).ravel()

                for col_ in self.label_cols:
                    df_[col_] = self.encoders[col_].inverse_transform(df_[col_].astype(int))

        if inplace:
            self._is_encoded = False
            return self
        return df_

    def get_df(self, as_copy: bool = False, last_encoded: bool = False):
        """
        Get the DataFrame currently stored by the manager.

        Parameters:
        -----------
        as_copy: bool
            Whether to return a copy of the DataFrame instead of the stored
            object itself.
        last_encoded: bool
            Whether to additionally return the flag telling if the stored
            DataFrame is currently encoded.

        Returns:
        --------
        pd.DataFrame or Tuple[pd.DataFrame, bool]
            Only the DataFrame when ``last_encoded`` is False; otherwise a
            tuple of the DataFrame and the "is encoded" flag.
        """
        frame = self.df.copy() if as_copy else self.df
        if last_encoded:
            return frame, self._is_encoded
        return frame

    def encoded_cols(self):
        """
        Get the column names configured for each encoder type.

        Returns:
        --------
        tuple
            A 3-tuple ``(onehot_cols, ordinal_cols, label_cols)`` holding the
            column collections exactly as they were passed to the constructor.
        """
        return self.onehot_cols, self.ordinal_cols, self.label_cols

    def normalize(self, method="min-max", inplace=False):
        """
        Normalize all float and int columns of the stored DataFrame.

        Two methods are supported: "min-max" (MinMaxScaler) and "z-score"
        (StandardScaler). Columns of other dtypes are left unchanged.

        Parameters:
        -----------
        method: str
            Normalization method to use, default "min-max".
        inplace: bool
            If True, the stored DataFrame is modified; otherwise a normalized
            copy is returned and the stored DataFrame is left untouched.

        Returns:
        --------
        EncoderManager or pd.DataFrame
            ``self`` when ``inplace`` is True (allows call chaining),
            otherwise the normalized copy of the DataFrame.

        Raises:
        -------
        ValueError
            If ``method`` is neither "min-max" nor "z-score".
        """
        if method == "min-max":
            scaler = MinMaxScaler()
        elif method == "z-score":
            scaler = StandardScaler()
        else:
            raise ValueError("Invalid normalization method specified.")

        df_ = self.df if inplace else self.df.copy()
        numeric_columns = df_.select_dtypes(include=["float", "int"]).columns

        # Nothing to scale when the frame has no numeric columns.
        if len(numeric_columns) > 0:
            # Assign by column labels; the scaler output is a plain array
            # whose columns are in the same order as ``numeric_columns``.
            df_[numeric_columns] = scaler.fit_transform(df_[numeric_columns])

        if inplace:
            return self
        return df_

    def get_encoders(self):
        """
        Get the encoders used for encoding.

        Returns:
        --------
        Dict[str, object]
            A dictionary mapping each configured column name to its encoder
            object. This is the manager's own dictionary, not a copy.
        """
        return self.encoders
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue