first commit

2025-07-12 01:17:12 +02:00 · 2025-07-12 01:17:12 +02:00 · 7aa9fef838
commit 7aa9fef838
90 changed files with 31987 additions and 0 deletions
--- a/lib/EncodeManagerTester.py
+++ b/lib/EncodeManagerTester.py
@ -0,0 +1,40 @@
+import pandas as pd
+from EncoderManager import EncoderManager
+
+
+def main():
+    """
+    Exercise EncoderManager end to end on a small example DataFrame.
+
+    Prints the encoded frame, the frame obtained after a real encode -> decode round trip,
+    whether that round trip reproduced the input data, and finally the min-max normalized frame.
+    """
+    # Define example data
+    data = {
+        'Color': ['Red', 'Blue', 'Green', 'Red', 'Green', 'Blue'],
+        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Male'],
+        'Size': ['M', 'S', 'L', 'M', 'XL', 'M'],
+        'Species': ['Dog', 'Cat', 'Bird', 'Dog', 'Bird', 'Cat'],
+        'Count': [2, 1, 3, 4, 1, 2]
+    }
+    df = pd.DataFrame(data)
+
+    # Define encoding parameters
+    label_cols = ('Gender', 'Species')
+    ordinal_cols = ('Size',)
+    onehot_cols = ('Color',)
+
+    # Create encoder manager
+    encoder = EncoderManager(df, label_cols, ordinal_cols, onehot_cols)
+
+    # Test encoding (non-mutating: the manager's own frame stays un-encoded)
+    encoded_df = encoder.encode(inplace=False)
+    print("Encoded DataFrame:")
+    print(encoded_df)
+
+    # Test decoding. decode() only does work when the manager's own frame is flagged as
+    # encoded, so the round trip has to go through the in-place variants; decoding right
+    # after encode(inplace=False) would just hand back an untouched copy of the input.
+    decoded_df = encoder.encode(inplace=True).decode(inplace=True).get_df(as_copy=True)
+    print("Decoded DataFrame:")
+    print(decoded_df)
+
+    # Compare values column by column (column order is aligned to the input first).
+    same_values = (decoded_df[list(df.columns)].to_numpy() == df.to_numpy()).all()
+    print("Round trip restored the original data:", bool(same_values))
+
+    # Test normalization on the (decoded) frame held by the manager
+    df_normalized = encoder.normalize('min-max', inplace=True).get_df()
+    print("Normalized DataFrame:")
+    print(df_normalized)
+
+
+if __name__ == "__main__":
+    main()
--- a/lib/EncoderManager.py
+++ b/lib/EncoderManager.py
@ -0,0 +1,233 @@
+import warnings
+
+import numpy as np
+import pandas as pd
+from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder
+from sklearn.preprocessing import MinMaxScaler, StandardScaler
+
+
+class EncoderManager:
+    """
+    Manage encoding, decoding and normalization of a private copy of a pandas DataFrame.
+
+    Each configured column gets its own scikit-learn encoder (LabelEncoder, OrdinalEncoder or
+    OneHotEncoder), created with default settings. The manager keeps a copy of the DataFrame it
+    was given and can either transform that copy in place (returning itself, so calls can be
+    chained) or hand back a transformed copy while leaving its own frame untouched.
+
+    One-hot encoded columns are replaced by one indicator column per category, named
+    ``"<column>_<category>"``.
+
+    NOTE(review): decode() casts label/ordinal codes back with ``astype(int)``; if an encoded
+    frame has been rescaled with ``normalize(inplace=True)`` those codes are no longer the
+    encoder's integers, so decoding afterwards will not recover the original categories.
+
+    Parameters:
+    -----------
+    df_: pd.DataFrame
+        The DataFrame to be encoded and decoded. It is copied; the caller's object is not modified.
+    label_cols: tuple
+        The columns to encode with LabelEncoder.
+    ordinal_cols: tuple
+        The columns to encode with OrdinalEncoder.
+    onehot_cols: tuple
+        The columns to encode with OneHotEncoder.
+    """
+
+    def __init__(self, df_: pd.DataFrame, label_cols: tuple, ordinal_cols: tuple, onehot_cols: tuple):
+        """
+        Initialize the EncoderManager instance.
+
+        Parameters:
+        -----------
+        df_: pd.DataFrame
+            The DataFrame to be encoded and decoded.
+        label_cols: tuple
+            The columns to encode with LabelEncoder.
+        ordinal_cols: tuple
+            The columns to encode with OrdinalEncoder.
+        onehot_cols: tuple
+            The columns to encode with OneHotEncoder.
+        """
+        self.df = df_.copy()
+        self.label_cols = label_cols
+        self.ordinal_cols = ordinal_cols
+        self.onehot_cols = onehot_cols
+        self.encoders = {}
+        self._is_encoded = False
+        # Remember where every column started so decode() can put one-hot columns back in place.
+        self._column_positions = {name: pos for pos, name in enumerate(self.df.columns)}
+
+        for col_ in self.label_cols:
+            self.encoders[col_] = LabelEncoder()
+
+        for col_ in self.ordinal_cols:
+            self.encoders[col_] = OrdinalEncoder()
+
+        for col_ in self.onehot_cols:
+            self.encoders[col_] = self._make_onehot_encoder()
+
+    @staticmethod
+    def _make_onehot_encoder():
+        """Build a dense-output OneHotEncoder on both old and new scikit-learn releases."""
+        try:
+            # Newer scikit-learn spells the dense-output switch `sparse_output`.
+            return OneHotEncoder(sparse_output=False)
+        except TypeError:
+            # Older releases only know the original `sparse` keyword.
+            return OneHotEncoder(sparse=False)
+
+    def encode(self, inplace: bool = False):
+        """
+        Fit the configured encoders and encode the label, ordinal and one-hot columns.
+
+        Nothing is transformed when the manager's frame is already flagged as encoded.
+
+        Parameters:
+        -----------
+        inplace: bool
+            If True, the manager's own DataFrame is modified and flagged as encoded.
+            If False, a copy is encoded and returned and the manager's state is left as it was
+            (the encoders are still fitted). Default is False.
+
+        Returns:
+        --------
+        EncoderManager or pd.DataFrame
+            The manager itself when inplace is True (to allow chaining), otherwise the encoded copy.
+        """
+        df_ = self.df if inplace else self.df.copy()
+
+        # prevent multiple encoding
+        if not self._is_encoded:
+            # Silence warnings only for the duration of the transforms; the context manager
+            # restores the caller's warning filters afterwards, even if an encoder raises.
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+
+                for col_ in self.label_cols:
+                    df_[col_] = self.encoders[col_].fit_transform(df_[col_])
+
+                for col_ in self.ordinal_cols:
+                    # OrdinalEncoder works on 2-D input and returns an (n, 1) array.
+                    df_[col_] = self.encoders[col_].fit_transform(df_[[col_]]).ravel()
+
+                for col_ in self.onehot_cols:
+                    encoder = self.encoders[col_]
+                    encoded = encoder.fit_transform(df_[[col_]])
+
+                    for i_, category in enumerate(encoder.categories_[0]):
+                        df_[f"{col_}_{category}"] = encoded[:, i_]
+                    df_.drop(columns=col_, inplace=True)
+
+        if inplace:
+            self._is_encoded = True
+            return self
+        return df_
+
+    def decode(self, inplace: bool = False):
+        """
+        Reverse encode(): rebuild the one-hot, ordinal and label columns from their encoded form.
+
+        Nothing is transformed unless the manager's frame is flagged as encoded, i.e. unless
+        ``encode(inplace=True)`` ran before. One-hot columns are re-inserted at the position
+        they had in the DataFrame the manager was created with.
+
+        Parameters:
+        -----------
+        inplace: bool
+            If True, the manager's own DataFrame is modified and flagged as not encoded.
+            If False, a copy is decoded and returned. Defaults to False.
+
+        Returns:
+        --------
+        EncoderManager or pd.DataFrame
+            The manager itself when inplace is True (to allow chaining), otherwise the decoded copy.
+        """
+        df_ = self.df if inplace else self.df.copy()
+
+        # prevent multiple decoding
+        if self._is_encoded:
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+
+                restored = {}
+                for col_ in self.onehot_cols:
+                    categories = np.asarray(self.encoders[col_].categories_[0])
+                    # Address the indicator columns by their exact names; a substring match
+                    # could pick up unrelated columns that merely contain the same text.
+                    indicator_cols = [f"{col_}_{category}" for category in categories]
+                    # Per row, the category whose indicator is largest is the decoded value.
+                    winners = df_[indicator_cols].to_numpy().argmax(axis=1)
+                    restored[col_] = categories[winners]
+                    df_.drop(columns=indicator_cols, inplace=True)
+
+                # Re-insert from left to right so earlier insertions keep later positions valid.
+                fallback = len(self._column_positions)
+                for col_ in sorted(restored, key=lambda name: self._column_positions.get(name, fallback)):
+                    loc = min(self._column_positions.get(col_, fallback), df_.shape[1])
+                    df_.insert(loc, col_, restored[col_])
+
+                for col_ in self.ordinal_cols:
+                    codes = df_[[col_]].astype(int)
+                    df_[col_] = self.encoders[col_].inverse_transform(codes).ravel()
+
+                for col_ in self.label_cols:
+                    df_[col_] = self.encoders[col_].inverse_transform(df_[col_].astype(int))
+
+        if inplace:
+            self._is_encoded = False
+            return self
+        return df_
+
+    def get_df(self, as_copy: bool = False, last_encoded: bool = False):
+        """
+        Get the DataFrame currently held by the manager.
+
+        Parameters:
+        -----------
+        as_copy: bool
+            Whether to return a copy instead of the manager's own DataFrame object.
+        last_encoded: bool
+            Whether to also return the flag telling if the held DataFrame is encoded.
+
+        Returns:
+        --------
+        pd.DataFrame or Tuple[pd.DataFrame, bool]
+            Only the DataFrame when last_encoded is False; otherwise a tuple of the DataFrame
+            and a boolean indicating whether it is currently encoded.
+        """
+        frame = self.df.copy() if as_copy else self.df
+        if last_encoded:
+            return frame, self._is_encoded
+        return frame
+
+    def encoded_cols(self):
+        """
+        Get the column groups configured for encoding.
+
+        Returns:
+        --------
+        tuple
+            The three column collections exactly as passed to the constructor, in the order
+            onehot_cols, ordinal_cols, label_cols.
+        """
+        return self.onehot_cols, self.ordinal_cols, self.label_cols
+
+    def normalize(self, method="min-max", inplace=False):
+        """
+        Rescale the float/int columns of the held DataFrame.
+
+        This supports two methods: "min-max" (MinMaxScaler) and "z-score" (StandardScaler).
+        The scaler is fitted on the current data and is not kept afterwards. When the frame
+        has no float/int columns it is left unchanged.
+
+        Parameters:
+        -----------
+        method: str
+            Normalization method to use, default "min-max".
+        inplace: bool
+            If True, the manager's own DataFrame is modified; otherwise a normalized copy is returned.
+
+        Returns:
+        --------
+        EncoderManager or pd.DataFrame
+            The manager itself when inplace is True (to allow chaining), otherwise the normalized copy.
+
+        Raises:
+        -------
+        ValueError
+            If method is neither "min-max" nor "z-score".
+        """
+        if method == "min-max":
+            scaler = MinMaxScaler()
+        elif method == "z-score":
+            scaler = StandardScaler()
+        else:
+            raise ValueError("Invalid normalization method specified.")
+
+        numeric_cols = self.df.select_dtypes(include=["float", "int"]).columns
+        target = self.df if inplace else self.df.copy()
+
+        # The scalers reject input without any feature column, so skip them in that case.
+        if len(numeric_cols) > 0:
+            # Index by column labels; indexing with the numeric sub-frame itself would be
+            # treated as a boolean mask and fail.
+            target[numeric_cols] = scaler.fit_transform(self.df[numeric_cols])
+
+        return self if inplace else target
+
+    def get_encoders(self):
+        """
+        Get the encoders used for encoding.
+
+        Returns:
+        --------
+        Dict[str, object]
+            A dictionary mapping each configured column name to its encoder object
+            (the manager's own dictionary, not a copy).
+        """
+        return self.encoders
+