{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:02.486093400Z",
"start_time": "2023-06-20T09:46:02.161449400Z"
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import warnings\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.manifold import TSNE\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.tree import DecisionTreeClassifier"
]
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [
{
"data": {
"text/plain": " City Year Sport Discipline Event \\\n0 Montreal 1976.0 Aquatics Diving 3m springboard \n1 Montreal 1976.0 Aquatics Diving 3m springboard \n2 Montreal 1976.0 Aquatics Diving 3m springboard \n3 Montreal 1976.0 Aquatics Diving 3m springboard \n4 Montreal 1976.0 Aquatics Diving 10m platform \n\n Athlete Gender Country_Code Country Event_gender \\\n0 KÖHLER, Christa Women GDR East Germany W \n1 KOSENKOV, Aleksandr Men URS Soviet Union M \n2 BOGGS, Philip George Men USA United States M \n3 CAGNOTTO, Giorgio Franco Men ITA Italy M \n4 WILSON, Deborah Keplar Women USA United States W \n\n Medal \n0 Silver \n1 Bronze \n2 Gold \n3 Silver \n4 Bronze ",
"text/html": "
",
"image/png": ""
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Silence warnings only while the msno heatmap is drawn.\n",
"# catch_warnings() restores the previous warning filters on exit, whereas\n",
"# calling filterwarnings('default') afterwards would install a new global\n",
"# rule that overrides them for the rest of the session.\n",
"# NOTE(review): `msno` (presumably missingno) and `df` are not defined in the\n",
"# cells above this one - confirm they are created before this cell runs.\n",
"with warnings.catch_warnings():\n",
"    warnings.simplefilter('ignore')\n",
"    msno.heatmap(df)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:05.067628400Z",
"start_time": "2023-06-20T09:46:04.681718300Z"
}
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"data": {
"text/plain": " City Year Sport Discipline Event Athlete Gender Country_Code Country \\\n770 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n771 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n772 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n773 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n774 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n.. ... ... ... ... ... ... ... ... ... \n882 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n883 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n884 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n885 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n886 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n\n Event_gender Medal \n770 NaN NaN \n771 NaN NaN \n772 NaN NaN \n773 NaN NaN \n774 NaN NaN \n.. ... ... \n882 NaN NaN \n883 NaN NaN \n884 NaN NaN \n885 NaN NaN \n886 NaN NaN \n\n[117 rows x 11 columns]",
"text/html": "
"
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Flag outliers: rows whose value in a numeric column lies more than\n",
"# `threshold` standard deviations away from that column's mean.\n",
"numerical_cols = df.select_dtypes(include=['float64', 'int64'])\n",
"\n",
"means = numerical_cols.mean()\n",
"stds = numerical_cols.std()\n",
"\n",
"# number of standard deviations beyond which a value counts as an outlier\n",
"threshold = 3\n",
"\n",
"# Collect one frame per column and concatenate once at the end. .assign()\n",
"# returns a new frame, so the 'feature' label is never written onto a slice\n",
"# of df (which is what triggers SettingWithCopyWarning).\n",
"outlier_frames = []\n",
"for col in numerical_cols.columns:\n",
"    lower = means[col] - threshold * stds[col]\n",
"    upper = means[col] + threshold * stds[col]\n",
"    is_outlier = (df[col] < lower) | (df[col] > upper)\n",
"    outlier_frames.append(df.loc[is_outlier].assign(feature=col))\n",
"\n",
"# pd.concat raises on an empty list, so fall back to an empty frame when\n",
"# there are no numeric columns.\n",
"outliers = pd.concat(outlier_frames) if outlier_frames else pd.DataFrame()\n",
"\n",
"outliers.head()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:05.225119700Z",
"start_time": "2023-06-20T09:46:05.146554900Z"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 19,
"outputs": [
{
"data": {
"text/plain": "array([[]], dtype=object)"
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAiwAAAGvCAYAAAB4u44CAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAxZUlEQVR4nO3de1xVdb7/8TcbULY3QDG1Tk5HuVReEmnwmqVn9sPHaEoBho+YjtrRJiK7HbS8lE48ND1N5ThzYkotctRs0JiJDmn1GCszITOmsU4Yu5rEMW8oDmwhbuv3Rz/3aSeIwMb93fB6Ph77D7+ftdb+ri/f7+rdXvsSYFmWJQAAAIPZfN0BAACA5hBYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAtLuMjAzFxMTo/fffb7S+e/duxcTEaPXq1Ze4ZwD8RQBfzQ+gvZWXl+vmm29W165d9frrr8tut7trLpdL06ZNU7du3fTqq6+qS5cuPuwpAFPxCguAdhcWFqbly5fr8OHDWrNmjUftqaee0vHjx/Xkk08SVgA0icAC4JL42c9+pptvvll/+MMfdODAAUnSxx9/rC1btujee+/VNddcoyNHjuihhx5SfHy8rrvuOs2aNUv/+7//63Gcw4cPa+HChRo/fryGDBmiMWPGaOHChTp9+rR7m0mTJmnlypWaNWuWRo4cqccee+ySnisA7+OWEIBL5vTp07r55pt1+eWXa8uWLUpKSpLdbteWLVt05swZ3XLLLbLb7br33ntlt9v10ksv6dNPP9W2bds0ePBgVVVVaerUqQoPD9fdd9+tnj17av/+/frv//5vJSUlKTMzU9L3geXYsWNKTU3VjTfeqJCQEMXFxfn47AG0RZCvOwCg8wgPD9fy5ct177336s4779ShQ4f0pz/9SYGBgXrppZdUXl6ul19+WVdccYUkacKECZoyZYp+85vfaO3atfr73/+u/v37a9WqVRo4cKAkafTo0Tpw4IA+/PBDj+e67LLL9Mgjj8hm44VkoCMgsAC4pBwOh6ZMmaL8/Hw99thjuuqqqyRJe/fu1TXXXKN+/fqprq5OkmSz2TRhwgS99tprkqRrrrlGW7ZsUUNDg0pLS/X3v/9dJSUl+uqrr9z7nDN48GDCCtCBEFgAXHI33HCD8vPzdeONN7rbysvL9c0332jIkCGN7lNVVSW73a4XX3xRzz33nE6fPq2IiAgNGTJEdrtdFRUVHttHRES06zkAuLQILACM0LNnT8XHx2vhwoWN1rt06aK8vDytWrVK//mf/6nk5GT17t1bknT//fe738gLoGMisAAwQnx8vPLy8vSv//qv6tGjh7t9xYoV+u677/T4449r//796tmzp+666y533eVyaf/+/QoK4nIGdGTc4AVghNmzZ6uhoUGzZ89Wfn6+9u7dq0cffVQbN27UoEGDJEnDhw9XRUWFVq1apcLCQuXl5Sk1NVUnT55UVVWVj88AQHvif0kAGKFfv37aunWrnnrqKS1fvlzfffedrrrqKq1YsULJycmSpFtvvVWHDx/W9u3btWXLFvXr10833nijbr/9dj366KNyOp2KjIz08ZkAaA98DwsAADAet4QAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAON1uG+6LSurUGNfhRcQIPXp07PJemfG2DSOcWkaY9M4xqVxjEvTGJv/G4PmdLjAYlm64B+9uXpnxtg0jnFpGmPTOMalcYxL0xib5nFLCAAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxOtyvNQMAYDqbLUA2W4D734GB5r9+0NBgqaHBdz8pTWABAOASstkCFBrWTUE/CCnh4d192KOLU1ffoDPlZ30WWggsAA
BcQjZbgIICbbp/a5Gcxyt93Z2LEnlZD/1mZqxstgACCwAAnYnzeKU+O/JPX3fDb5h/0wwAAHR6LQosxcXFmjNnjuLj4zVu3DgtXLhQp06dkiR98sknmjFjhmJjYzVp0iTl5OR47JubmyuHw6ERI0YoMTFRRUVF7lp9fb1Wr16tsWPHKjY2VmlpaTp+/LgXTg8A0BI2W4CCgmxefZx7Q2lgoHePe+7xwzevouO66FtC1dXVmjt3rm677TY999xzcrlcevjhh7V48WKtXr1ad911l+677z6lpKRo3759Sk9PV0xMjIYPH67CwkJlZmZq3bp1Gj58uDZv3qy0tDTt2rVLdrtdWVlZ2rNnj7Zv366ePXvq0Ucf1dKlS/X888+357kDAH6gsTeDelN7vbHU128GxaVx0YHlyJEjuvrqq5Wenq7AwEB16dJFKSkpWrhwod58802FhYUpNTVVkjRmzBhNmzZNmzdv1vDhw5WTk6OpU6cqLi5OkjR79my98sorys/PV1JSknJycpSRkaEBAwZIkpYsWaLx48ertLRUV155ZTucNgDgx3gzKEx20YFl0KBBWr9+vUfbzp07NWTIEJWUlCg6OtqjFhkZqW3btkmSnE6nkpKSzqsXFxeroqJCR48e9dg/IiJCoaGhOnjwYIsDS0ATrwyea2+q3pkxNo1jXJrG2DSuo4yLP74Z1B++x+Qcf+prY7w9vy/2eK36lJBlWVqzZo127dqlTZs2aePGjbLb7R7bhISE6OzZs5Ikl8vVZN3lckmSunXrdl79XK0l+vTp2aZ6Z8bYNI5xaRpj0zjG5dLp26Or6hss9eplb35jtJkvvy+mxYGlsrJSixYt0meffaZNmzYpJiZGdrtdFRUVHttVV1ere/fvT8xut6u6uvq8enh4uDvIVFVVNbl/S5SVVchq5FXBgIDvLyJN1TszxqZxjEvTGJvG+fu4BAba/OILzH6olz1IgbYAv7qNdVNMXy2YfLWvu9Eqp0+7VF/f4NVjnls3zWlRYDl06JDmzZunyy+/XNu2bVPv3r0lSdHR0dqzZ4/Htk6nU1FRUZKkqKgolZSUnFefMGGCQkND1a9fPzmdTvdtoRMnTqi8vPy820wXw7J0wQtFc/XOjLFpHOPSNMamcYzLpedPt7EG9/WvUPhjvprbF30j7cyZM5o1a5ZGjhypDRs2uMOKJDkcDp08eVLZ2dmqra1VQUGB8vLy3O9bSU5OVl5engoKClRbW6vs7GyVlZXJ4XBIkhITE5WVlaXS0lJVVlZq5cqVio+P18CBA718ugAAwB9d9Cssr776qo4cOaI33nhDO3bs8KgVFRXphRde0IoVK7R27Vr17t1bS5cu1ejRoyV9/6mhZcuWafny5Tp27JgiIyO1bt06hYWFSZLS09NVV1en1NRUuVwujRo1SmvWrPHaSQIAAP920YFlzpw5mjNnTpP1YcOGaevWrU3WExISlJCQ0GgtODhYGRkZysjIuNjuAACATsS/P1sFAAA6BQILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYLwgX3cAADoqmy1ANluAr7tx0QID+X9YmIvAAgDtwGYLUGhYNwURAgCvILAAQDuw2QIUFGjT/VuL5Dxe6evuXJSbYvpqweSrfd0NoFEEFgBoR87jlfrsyD993Y2LMrhvd193AWgSr1UCAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYLxWB5ZTp07J4XCosLBQkvTYY48pNjbW43HNNdfoP/7jP9z7/PznP9d1113nsc2XX34pSaqvr9fq1as1duxYxcbGKi0tTcePH2/j6QEAgI6gVYFl//79SklJ0aFDh9xtjz/+uIqKit
yP3/72t+rVq5ceeeQRSVJlZaW+/vpr5efne2w3ePBgSVJWVpb27Nmj7du3a/fu3QoJCdHSpUu9cIoAAMDftTiw5ObmKiMjQw8++GCT25w6dUoZGRlasmSJoqKiJEmffvqpwsLCdMUVVzS6T05OjubNm6cBAwaoR48eWrJkid577z2Vlpa2tIsAAKCDCWrpDuPHj9e0adMUFBTUZGj59a9/raFDh2r69OnutgMHDshut+sXv/iFSkpKdMUVV2j+/PmaOHGiKioqdPToUUVHR7u3j4iIUGhoqA4ePKgrr7zyovsXEHDh9qbqnRlj0zjGpWmMTeMYF3QG3p7fF3u8FgeWvn37XrBeWlqq1157TTk5OT/qUICGDRumhx56SJdffrl27Nih+fPna9OmTerfv78kqVu3bh77hISEyOVytah/ffr0bFO9M2NsGse4NI2xaRzjgo4qPLy7z567xYGlOdu3b3e/4faH5s6d6/Hv6dOn6/XXX9fOnTt19913S5Kqqqo8tqmurlb37i0bnLKyClnW+e0BAd9fRJqqd2aMTeMYl6YxNo374bjYbDafXtyB9nD6tEv19Q1ePea5ddMcrweWN998U3feeed57Rs2bNC1116rMWPGuNtqamrUtWtXhYaGql+/fnI6ne7bQidOnFB5ebnHbaKLYVm64AW0uXpnxtg0jnFpGmPTOMYEHZmv5rdXv4fl9OnT+vLLL/XTn/70vNq3336rX/3qVyotLVVdXZ22bdumoqIi3XrrrZKkxMREZWVlqbS0VJWVlVq5cqXi4+M1cOBAb3YRAAD4Ia++wnL48GFJUr9+/c6rLVy4UDabTbfffrsqKioUGRmp559/Xj/5yU8kSenp6aqrq1NqaqpcLpdGjRqlNWvWeLN7AADAT7UpsBw8eNDj38OGDTuv7ZwuXbpo8eLFWrx4caP14OBgZWRkKCMjoy1dAgAAHZDX38MCAO3BZguQzeYfnxcODLQpMJBfPgG8icACwHg2W4BCw7opyE9CAJ8OAryPwALAeDZbgIICbbp/a5Gcxyt93Z2LclNMXy2YfLWvuwF0GAQWAH7DebxSnx35p6+7cVEG9+VVFsCb/OP1VQAA0KkRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYL8nUHAH9nswXIZgvwdTdapKHBUkOD5etuAMBFI7AAbWCzBSg0rJuCAv3rxcq6+gadKT9LaAHgNwgsQBvYbAEKCrTp/q1Fch6v9HV3LkrkZT30m5mxstkCCCwA/AaBBfAC5/FKfXbkn77uBgB0WP71OjYAAOiUWh1YTp06JYfDocLCQnfbsmXLNHToUMXGxrofr7zyiruem5srh8OhESNGKDExUUVFRe5afX29Vq9erbFjxyo2NlZpaWk6fvx4a7sHAAA6kFYFlv379yslJUWHDh3yaD9w4IAyMzNVVFTkfqSkpEiSCgsLlZmZqVWrVmnfvn2aPn260tLSVFVVJUnKysrSnj17tH37du3evVshISFaunRpG08PAAB0BC0OLLm5ucrIyNCDDz7o0V5TU6MvvvhCQ4cObXS/nJwcTZ06VXFxcQoODtbs2bMVHh6u/Px8d33evHkaMGCAevTooSVLlui9995TaWlpK04LAAB0JC0OLOPHj9dbb72lKVOmeLQXFxerrq5Oa9eu1dixYzV58mQ9//zzamhokCQ5nU5FR0d77BMZGani4mJVVFTo6NGjHvWIiAiFhobq4MGDLepfQEDTj+bqnfnB2LRuXPydv8wZAObw1Rpv8aeE+vbt22h7RUWF4uPjdccdd+jpp5/W559/rvT0dNlsNs2dO1cul0t2u91jn5CQEJ09e1Yul0uS1K1bt/
Pq52oXq0+fnm2qd2aMTeM66riEh3dv8zE66tgAaJw3rhut5bWPNY8bN07jxo1z/3v48OGaNWuW8vPzNXfuXNntdlVXV3vsU11drfDwcHeQOfd+lh/Wu3dv2eCUlVXIauSrJQICvr+4NlXvzBibxl3MuAQG2ny6gNvi9GmX6usbWrXvpZ4z/jzOQEfSlutGU85dT5rjtcDy9ttv6+TJk5o5c6a7raamRiEhIZKkqKgolZSUeOzjdDo1YcIEhYaGql+/fh63jU6cOKHy8vLzbiM1x7J0wQtoc/XOjLFpXEcel7aeV0ceGwCN89Wa99r3sFiWpSeeeEJ79+6VZVkqKirSxo0b3Z8SSk5OVl5engoKClRbW6vs7GyVlZXJ4XBIkhITE5WVlaXS0lJVVlZq5cqVio+P18CBA73VRQAA4Ke89gqLw+HQokWLtHz5ch07dkwRERGaP3++EhISJEljxozRsmXL3PXIyEitW7dOYWFhkqT09HTV1dUpNTVVLpdLo0aN0po1a7zVPQAA4MfaFFh+/AmemTNnetwS+rGEhAR3gPmx4OBgZWRkKCMjoy1dAgAAHRC/JQRj2GwBstnM+wxr4AV+iflCNQCA9xBYYASbLUChYd0UZGAA4NMpAOB7BBYYwWYLUFCgTfdvLZLzeKWvu3PRborpqwWTr/Z1NwCgwyOwwCjO45X67Mg/fd2Niza4L6++AMClYN7r7wAAAD9CYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI8vjuugWvO7PL78XRx+kwcAcCEElg6otb/Lw2/mAABMRWDpgPzxd3n4TR4AwIUQWDowf/pdHn6TBwBwIbxxAAAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMx1fzX4TW/PKxL/HLxwCAjobA0ozW/vIxAADwHgJLM/jlYwAAfI/AcpH45WMAAHyH+xwAAMB4BBYAAGA8AgsAADBeqwPLqVOn5HA4VFhY6G7buXOnEhISNHLkSE2aNEm/+93v1NDQ4K7//Oc/13XXXafY2Fj348svv5Qk1dfXa/Xq1Ro7dqxiY2OVlpam48ePt+HUAABAR9GqwLJ//36lpKTo0KFD7rZPP/1UCxcu1AMPPKCPPvpI69at06uvvqrs7GxJUmVlpb7++mvl5+erqKjI/Rg8eLAkKSsrS3v27NH27du1e/duhYSEaOnSpW0/QwAA4PdaHFhyc3OVkZGhBx980KP9H//4h2bOnKmJEyfKZrNp8ODBcjgc2rdvn6TvA01YWJiuuOKKRo+bk5OjefPmacCAAerRo4eWLFmi9957T6Wlpa04LQAA0JG0OLCMHz9eb731lqZMmeLRPnnyZC1atMj97+rqar3zzjsaMmSIJOnAgQOy2+36xS9+oVGjRikxMVG7du2SJFVUVOjo0aOKjo527x8REaHQ0FAdPHiwRf0LCGj60Vy9qX2Ajqql66Gt66ktzwXADL5a4y3+Hpa+ffs2u01lZaXuv/9+hYSEaPbs2f//BAM0bNgwPfTQQ7r88su1Y8cOzZ8/X5s2bVL//v0lSd26dfM4TkhIiFwuV4v616dPzzbVgc4iPLzt39fDegI6F29cN1rL618c99VXX+m+++5Tnz59tHHjRvXo0UOSNHfuXI/tpk+frtdff107d+7U3XffLUmqqqry2Ka6ulrdu7dscMrKKmRZ57cHBHx/cW2q3pTAQJtP/0BAezl92qX6+obmN2xEa9dTa7EOATO05brRlHPXk+Z49WPN7777rmbMmKEbbrhBGzZsUGhoqLu2YcMG7d2712P7mpoade3aVaGhoerXr5+cTqe7duLECZWXl3vcJroYltX0o7l6U/sAHVVL10Nb11NbnguAGXy1xr0WWP76178qPT1dixYt0sMPP6ygIM8Xb7799lv96le/Umlpqerq6rRt2zYVFRXp1ltvlSQlJi
YqKytLpaWlqqys1MqVKxUfH6+BAwd6q4sAAMBPee2W0O9//3vV1dVpxYoVWrFihbs9Li5O69ev18KFC2Wz2XT77beroqJCkZGRev755/WTn/xEkpSenq66ujqlpqbK5XJp1KhRWrNmjbe6BwAA/FibAssPP8Hz+9///oLbdunSRYsXL9bixYsbrQcHBysjI0MZGRlt6RIAAOiA+Gp+AABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGC8VgeWU6dOyeFwqLCw0N32ySefaMaMGYqNjdWkSZOUk5PjsU9ubq4cDodGjBihxMREFRUVuWv19fVavXq1xo4dq9jYWKWlpen48eOt7R4AAOhAWhVY9u/fr5SUFB06dMjddubMGd1111265ZZbtG/fPq1YsUJPPPGE/va3v0mSCgsLlZmZqVWrVmnfvn2aPn260tLSVFVVJUnKysrSnj17tH37du3evVshISFaunSpF04RAAD4uxYHltzcXGVkZOjBBx/0aH/zzTcVFham1NRUBQUFacyYMZo2bZo2b94sScrJydHUqVMVFxen4OBgzZ49W+Hh4crPz3fX582bpwEDBqhHjx5asmSJ3nvvPZWWlnrhNAEAgD9rcWAZP3683nrrLU2ZMsWjvaSkRNHR0R5tkZGRKi4uliQ5nc4m6xUVFTp69KhHPSIiQqGhoTp48GCL+hcQ0PSjuXpT+wAdVUvXQ1vXU1ueC4AZfLXGg1ra0b59+zba7nK5ZLfbPdpCQkJ09uzZZusul0uS1K1bt/Pq52oXq0+fnm2qA51FeHj3Nh+D9QR0Lt64brRWiwNLU+x2uyoqKjzaqqur1b17d3e9urr6vHp4eLg7yJx7P0tj+1+ssrIKWdb57QEB319cm6o3JTDQ5tM/ENBeTp92qb6+oVX7tnY9tRbrEDBDW64bTTl3PWmO1z7WHB0drZKSEo82p9OpqKgoSVJUVFST9dDQUPXr109Op9NdO3HihMrLy8+7jdQcy2r60Vy9qX2Ajqql66Gt66ktzwXADL5a414LLA6HQydPnlR2drZqa2tVUFCgvLw8JSUlSZKSk5OVl5engoIC1dbWKjs7W2VlZXI4HJKkxMREZWVlqbS0VJWVlVq5cqXi4+M1cOBAb3URAAD4Ka/dEgoPD9cLL7ygFStWaO3aterdu7eWLl2q0aNHS5LGjBmjZcuWafny5Tp27JgiIyO1bt06hYWFSZLS09NVV1en1NRUuVwujRo1SmvWrPFW9wAAgB9rU2D58Sd4hg0bpq1btza5fUJCghISEhqtBQcHKyMjQxkZGW3pEgAA6ID4an4AAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYLwgbx3otdde07JlyzzaamtrJUmffvqpli1bpu3btys4ONhdf+SRR5SSkiJJys3N1bPPPqsTJ05o0KBBevTRRxUbG+ut7gEAAD/mtcAyffp0TZ8+3f3vY8eOKSkpSQsWLJAkHThwQJmZmbr11lvP27ewsFCZmZlat26dhg8frs2bNystLU27du2S3W73VhcBAICfapdbQpZlacGCBb
rpppuUkJCgmpoaffHFFxo6dGij2+fk5Gjq1KmKi4tTcHCwZs+erfDwcOXn57dH9wAAgJ/x2issP/TnP/9ZTqdTzz77rCSpuLhYdXV1Wrt2rfbv36+ePXsqKSlJc+fOlc1mk9PpVFJSkscxIiMjVVxc3OLnDgi4cHtTdaAzau16YD0BnZe31/3FHs/rgaWhoUFZWVm6++671aNHD0lSRUWF4uPjdccdd+jpp5/W559/rvT0dNlsNs2dO1cul+u8Wz8hISE6e/Zsi5+/T5+ebaoDnUV4ePc2H4P1BHQu3rhutJbXA0thYaGOHz+u5ORkd9u4ceM0btw497+HDx+uWbNmKT8/X3PnzpXdbld1dbXHcaqrqxUeHt7i5y8rq5Blnd8eEPD9xbWpelMCA20+/QMB7eX0aZfq6xtatW9r11NrsQ4BM7TlutGUc9eT5ng9sOzcuVMOh0PdunVzt7399ts6efKkZs6c6W6rqalRSEiIJCkqKkolJSUex3E6nZowYUKLn9+ydMELaHN1oDNp61pgPQGdj6/WvNffdLt//3799Kc/9WizLEtPPPGE9u7dK8uyVFRUpI0bN7o/0pycnKy8vDwVFBSotrZW2dnZKisrk8Ph8Hb3AACAH/L6KyyHDx/WZZdd5tHmcDi0aNEiLV++XMeOHVNERITmz5+vhIQESdKYMWO0bNkydz0yMlLr1q1TWFiYt7sHAAD8kNcDS1FRUaPtM2fO9Lgl9GMJCQnuAAMAAPBDfDU/AAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADCeVwNLfn6+rr32WsXGxrofCxYskCR98sknmjFjhmJjYzVp0iTl5OR47JubmyuHw6ERI0YoMTFRRUVF3uwaAADwY0HePNiBAweUkJCgJ554wqP9zJkzuuuuu3TfffcpJSVF+/btU3p6umJiYjR8+HAVFhYqMzNT69at0/Dhw7V582alpaVp165dstvt3uwiAADwQ159heXAgQMaOnToee1vvvmmwsLClJqaqqCgII0ZM0bTpk3T5s2bJUk5OTmaOnWq4uLiFBwcrNmzZys8PFz5+fne7B4AAPBTXgssDQ0N+uyzz/TOO+9o4sSJmjBhgh599FGdOXNGJSUlio6O9tg+MjJSxcXFkiSn03nBeksEBDT9aK7e1D5AR9XS9dDW9dSW5wJgBl+tca/dEjp16pSuvfZaTZ48WWvXrtXp06f18MMPa8GCBerbt+95t3ZCQkJ09uxZSZLL5bpgvSX69OnZpjrQWYSHd2/zMVhPQOfijetGa3ktsERERLhv8UiS3W7XggULdNtttykxMVHV1dUe21dXV6t79+7ubRurh4eHt7gfZWUVsqzz2wMCvr+4NlVvSmCgzad/IKC9nD7tUn19Q6v2be16ai3WIWCGtlw3mnLuetIcr90SKi4u1q9//WtZP7h61dTUyGazafjw4SopKfHY3ul0KioqSpIUFRV1wXpLWFbTj+bqTe0DdFQtXQ9tXU9teS4AZvDVGvdaYAkLC9PmzZu1fv161dXV6ciRI3ryySd16623avLkyTp58qSys7NVW1urgoIC5eXlKSkpSZKUnJysvLw8FRQUqLa2VtnZ2SorK5PD4fBW9wAAgB/z2i2h/v3767nnntPTTz+trKwsde3aVVOnTtWCBQvUtWtXvfDCC1qxYoXWrl2r3r17a+nSpRo9erQkacyYMVq2bJmWL1+uY8eOKTIyUuvWrVNYWJi3ugcAAPyYV7+HJT4+Xlu3bm20NmzYsCZrkpSQkKCEhA
RvdgcAAHQQfDU/AAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADCeVwNLcXGx5syZo/j4eI0bN04LFy7UqVOnJEnLli3T0KFDFRsb63688sor7n1zc3PlcDg0YsQIJSYmqqioyJtdAwAAfsxrgaW6ulpz585VbGys3n//fb3++usqLy/X4sWLJUkHDhxQZmamioqK3I+UlBRJUmFhoTIzM7Vq1Srt27dP06dPV1pamqqqqrzVPQAA4Me8FliOHDmiq6++Wunp6erSpYvCw8OVkpKiffv2qaamRl988YWGDh3a6L45OTmaOnWq4uLiFBwcrNmzZys8PFz5+fne6h4AAPBjXgssgwYN0vr16xUYGOhu27lzp4YMGaLi4mLV1dVp7dq1Gjt2rCZPnqznn39eDQ0NkiSn06no6GiP40VGRqq4uLjF/QgIaPrRXL2pfYCOqqXroa3rqS3PBcAMvlrjQe1xMpZlac2aNdq1a5c2bdqkkydPKj4+XnfccYeefvppff7550pPT5fNZtPcuXPlcrlkt9s9jhESEqKzZ8+2+Ln79OnZpjrQWYSHd2/zMVhPQOfijetGa3k9sFRWVmrRokX67LPPtGnTJsXExCgmJkbjxo1zbzN8+HDNmjVL+fn5mjt3rux2u6qrqz2OU11drfDw8BY/f1lZhSzr/PaAgO8vrk3VmxIYaPPpHwhoL6dPu1Rf39CqfVu7nlqLdQiYoS3Xjaacu540x6ufEjp06JCSkpJUWVmpbdu2KSYmRpL09ttva+vWrR7b1tTUKCQkRJIUFRWlkpISj7rT6VRUVFSL+2BZTT+aqze1D9BRtXQ9tHU9teW5AJjBV2vca4HlzJkzmjVrlkaOHKkNGzaod+/ePzg5S0888YT27t0ry7JUVFSkjRs3uj8llJycrLy8PBUUFKi2tlbZ2dkqKyuTw+HwVvcAAIAf89otoVdffVVHjhzRG2+8oR07dnjUioqKtGjRIi1fvlzHjh1TRESE5s+fr4SEBEnSmDFjtGzZMnc9MjJS69atU1hYmLe6BwAA/JjXAsucOXM0Z86cJuszZ87UzJkzm6wnJCS4AwwAAMAP8dX8AADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYDwCCwAAMB6BBQAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4RgWWsrIy3XPPPbr++us1atQorVixQnV1db7uFgAA8DGjAssDDzygbt26affu3dq2bZv27t2r7OxsX3cLAAD4mDGB5ZtvvtGHH36oBQsWyG6368orr9Q999yjzZs3+7prAADAx4J83YFzSkpKFBYWpn79+rnbBg8erCNHjuif//ynevXqdVHHsdkkyzq/PSDgwvXmDLm8l+xdAlu+ow8M7ttDEn2+FPyx34MiukuSAgNb//8r59ZTUJCtVeuppc711Z/G2R/nBn2+NPyxz+euG9L3/x31pnPXk2a3s6xLcblp3p///Gc988wzeuedd9xthw4dksPh0Lvvvqv+/fv7rnMAAMCnjLkl1K1bN1VVVXm0nft39+7dG9sFAAB0EsYElqioKJWXl+vkyZPuti+//FL9+/dXz549fdgzAADga8YElquuuk
pxcXFauXKlKisrVVpaqmeffVbJycm+7hoAAPAxY97DIkknT57U448/rsLCQtlsNt1yyy3KyMhQYKB/vCkJAAC0D6MCCwAAQGOMuSUEAADQFAILAAAwHoEFAAAYj8ACAACMR2ABAADG8+vAcurUKTkcDhUWFrrb3n33Xd1yyy2KjY3V9OnT9dZbb7lrU6dOVWxsrMcjJiZGzz33nKTvP1YdExPjUZ80adIlPy9vaOnYNDQ06JlnntGECRMUFxen2267TR9++KG7Xl9fr9WrV2vs2LGKjY1VWlqajh8/fknPyRu8PS6dec5YlqV169Zp0qRJGjlypGbPnq0vvvjCXe+sc6a5cfH3OVNcXKw5c+YoPj5e48aN08KFC3Xq1ClJ0ieffKIZM2a4zyknJ8dj39zcXDkcDo0YMUKJiYkqKipy1zrCfGmvsfH3OeM1lp/66KOPrJ/97GdWdHS0VVBQYFmWZX366afWkCFDrD/+8Y9WbW2ttW/fPis2NtZd/7FnnnnGSkhIsCorKy3Lsqy//OUv1sSJEy/ZObSX1ozN5s2brSlTplhHjx616uvrrRdffNEaMWKEVV1dbVmWZf32t7+1pk2bZh05csSqqKiwHnjgAWvevHk+O8fWaI9x6cxz5qWXXrLi4+Ot/fv3W7W1tdbGjRutUaNGWWVlZZZldd4509y4+POcqaqqssaNG2f95je/sb777jvr1KlT1rx586xf/vKXVnl5uRUfH29t2rTJqq2ttT744AMrNjbW+uSTTyzLsqyCggIrNjbW+uijj6yamhrrxRdftEaNGmWdPXvWsiz/ny/tOTb+PGe8yS9fYcnNzVVGRoYefPBBj/Y33nhDI0eO1IwZMxQUFKTrr79e06ZN08svv3zeMQoKCvTSSy9pzZo17t8qOnDggIYOHXpJzqG9tHZsvvrqKzU0NKihoUGWZSkgIEAhISHu/XNycjRv3jwNGDBAPXr00JIlS/Tee++ptLT0kp5fa7XXuHTmOfP666/rjjvu0MiRIxUUFKQ77rhD4eHh2rFjh6TOO2eaGxd/njNHjhzR1VdfrfT0dHXp0kXh4eFKSUnRvn379OabbyosLEypqakKCgrSmDFjNG3aNG3evFnS9/Nh6tSpiouLU3BwsGbPnq3w8HDl5+e76/48X9pzbPx5zniTXwaW8ePH66233tKUKVM82uvr69WtWzePNpvNpq+++uq87ZYtW6a0tDRdddVV7vYDBw7o6NGjuvnmmzV69GjNmzdPTqez3c6jPbR2bGbOnKnq6mrddNNNGjZsmNasWaO1a9eqa9euqqio0NGjRxUdHe3eNyIiQqGhoTp48GD7n5QXtMe4SJ17zlyo3pnnTHN1f54zgwYN0vr16z2+fXznzp0aMmSISkpKPP7ekhQZGani4mJJktPpbLLeEeZLe42N5N9zxpv8MrD07dtXQUFB57U7HA69//772rlzp+rq6rR//37l5+fru+++89guLy9PZ8+e1b//+797tPfq1UtxcXHauHGj3n77bV111VWaM2eOKioq2vV8vKm1Y1NbW6v4+Hi98cYb+vjjjzV37lzdd999OnHihFwulySddxEOCQlx10zXHuMide45M3nyZP3hD3/Q559/rtraWr388sv6+uuv9d1333XqOXOhcZE6xpyRvn+vzjPPPKNdu3ZpyZIlcrlcstvtHtuEhITo7NmzknTBekeYLz/kzbGROs6caSu/DCxNGTlypP7rv/5Lv/vd7zRu3Dht2LBBiYmJ6tWrl8d2f/zjH5WSkuLx0r4kPfXUU3r44YfVu3dv9ejRQ4sWLZLL5dJHH310KU+jXTQ3NgsXLtSECRM0aNAghYSEKD09XT179tSOHTvcC6mqqsrjmNXV1e7baf6qLeMide45c+edd+qWW25Renq6Jk6cqK+++krjx49Xr169OvWcudC4SB1jzlRWVuq+++5TXl6eNm3apJiYGNntdlVXV3ts98O/94XqHWm+eHtspI4xZ7
yhQwWW8vJyRUVFKS8vT4WFhXr22Wf17bffetz7O3nypD7++GMlJCR47FtZWanVq1frH//4h7utvr5edXV15wUbf9Tc2Bw5ckQ1NTUe+wQFBSk4OFihoaHq16+fx0uQJ06cUHl5+XkvY/qbtoxLZ58zx44dU3Jysv7yl7/o/fff18MPP6zi4mINHTq0U8+ZC41LR5gzhw4dUlJSkiorK7Vt2zbFxMRIkqKjo1VSUuKxrdPpVFRUlCQpKiqqyXpHmS/tMTYdYc54S4cKLN98841uu+02FRcXq66uTvn5+dq1a5duv/129zYff/yxLrvsMl155ZUe+/bo0UMffPCBVq9erYqKCrlcLmVmZupf/uVfdP3111/qU/G65sZm0qRJysrKUmlpqWpra/XSSy/pxIkTmjhxoiQpMTHRXa+srNTKlSsVHx+vgQMH+vK02qwt49LZ58z//M//6J577tHp06flcrn01FNPqUuXLu6PW3bWOXOhcfH3OXPmzBnNmjVLI0eO1IYNG9S7d293zeFw6OTJk8rOzlZtba0KCgqUl5enpKQkSVJycrLy8vJUUFCg2tpaZWdnq6ysTA6HQ5L/z5f2Ght/nzNe5dsPKbXdDz9uaFmWtWXLFmvixInWiBEjrMTEROuDDz7w2H79+vVWcnJyo8c6fPiwlZ6ebsXHx1uxsbHW3XffbR0+fLhd+9+eWjI2lZWVVmZmpnXDDTdY119/vZWamur+yJ1lWVZNTY315JNPWjfccIM1cuRIKy0tzTp58uQlPR9v8ea4dOY5U1NTYy1fvtwaPXq0FRcXZ/3yl7+0SktLPeqdcc40Ny7+PGdeeOEFKzo62rruuuusESNGeDwsy7L+9re/WSkpKVZsbKz1b//2b9b27ds99v/Tn/5kTZ482RoxYoSVnJxs/fWvf3XX/H2+tOfY+POc8aYAy7IsX4cmAACAC+lQt4QAAEDHRGABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOP9P/NQ2gOMKN0JAAAAAElFTkSuQmCC"
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Histogram of every numeric column, to inspect its distribution.\n",
"df_numeric_ = df.select_dtypes(include='number')\n",
"df_numeric_.hist()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:47:41.224385500Z",
"start_time": "2023-06-20T09:47:41.114550700Z"
}
}
},
{
"cell_type": "code",
"execution_count": 16,
"outputs": [
{
"ename": "ValueError",
"evalue": "No variables found for grid columns.",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mValueError\u001B[0m Traceback (most recent call last)",
"Cell \u001B[1;32mIn[16], line 2\u001B[0m\n\u001B[0;32m 1\u001B[0m \u001B[38;5;66;03m# check relation with df pairplot\u001B[39;00m\n\u001B[1;32m----> 2\u001B[0m \u001B[43msns\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mpairplot\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdf\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43mvars\u001B[39;49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mnumerical_cols\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mcolumns\u001B[49m\u001B[43m[\u001B[49m\u001B[43m:\u001B[49m\u001B[38;5;241;43m-\u001B[39;49m\u001B[38;5;241;43m1\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mhue\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[38;5;124;43mGender\u001B[39;49m\u001B[38;5;124;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n",
"File \u001B[1;32m~\\miniconda3\\envs\\MchineLearning\\lib\\site-packages\\seaborn\\axisgrid.py:2114\u001B[0m, in \u001B[0;36mpairplot\u001B[1;34m(data, hue, hue_order, palette, vars, x_vars, y_vars, kind, diag_kind, markers, height, aspect, corner, dropna, plot_kws, diag_kws, grid_kws, size)\u001B[0m\n\u001B[0;32m 2112\u001B[0m \u001B[38;5;66;03m# Set up the PairGrid\u001B[39;00m\n\u001B[0;32m 2113\u001B[0m grid_kws\u001B[38;5;241m.\u001B[39msetdefault(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mdiag_sharey\u001B[39m\u001B[38;5;124m\"\u001B[39m, diag_kind \u001B[38;5;241m==\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mhist\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m-> 2114\u001B[0m grid \u001B[38;5;241m=\u001B[39m \u001B[43mPairGrid\u001B[49m\u001B[43m(\u001B[49m\u001B[43mdata\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43mvars\u001B[39;49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;28;43mvars\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mx_vars\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mx_vars\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43my_vars\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43my_vars\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mhue\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mhue\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 2115\u001B[0m \u001B[43m \u001B[49m\u001B[43mhue_order\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mhue_order\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mpalette\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mpalette\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mcorner\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mcorner\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 2116\u001B[0m \u001B[43m \u001B[49m\u001B[43mheight\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mheight\u001B[49m\u001B[43m,\u001B[49m\u001B[43m 
\u001B[49m\u001B[43maspect\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43maspect\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mdropna\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mdropna\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mgrid_kws\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 2118\u001B[0m \u001B[38;5;66;03m# Add the markers here as PairGrid has figured out how many levels of the\u001B[39;00m\n\u001B[0;32m 2119\u001B[0m \u001B[38;5;66;03m# hue variable are needed and we don't want to duplicate that process\u001B[39;00m\n\u001B[0;32m 2120\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m markers \u001B[38;5;129;01mis\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m:\n",
"File \u001B[1;32m~\\miniconda3\\envs\\MchineLearning\\lib\\site-packages\\seaborn\\axisgrid.py:1266\u001B[0m, in \u001B[0;36mPairGrid.__init__\u001B[1;34m(self, data, hue, vars, x_vars, y_vars, hue_order, palette, hue_kws, corner, diag_sharey, height, aspect, layout_pad, despine, dropna)\u001B[0m\n\u001B[0;32m 1263\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39msquare_grid \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mx_vars \u001B[38;5;241m==\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39my_vars\n\u001B[0;32m 1265\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m x_vars:\n\u001B[1;32m-> 1266\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mNo variables found for grid columns.\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[0;32m 1267\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m y_vars:\n\u001B[0;32m 1268\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mNo variables found for grid rows.\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n",
"\u001B[1;31mValueError\u001B[0m: No variables found for grid columns."
]
}
],
"source": [
"# Pairwise relations between the numeric columns, coloured by Gender.\n",
"# The last numeric column is left out as before, unless that would leave\n",
"# nothing to plot: with a single numeric column the empty selection makes\n",
"# seaborn raise \"No variables found for grid columns.\"\n",
"pair_vars = list(numerical_cols.columns[:-1]) or list(numerical_cols.columns)\n",
"sns.pairplot(df, vars=pair_vars, hue=\"Gender\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:05.364587500Z",
"start_time": "2023-06-20T09:46:05.162190600Z"
}
}
},
{
"cell_type": "raw",
"source": [
"Ideally, we would already have information on how two people view each other and could try to infer the missing values. In reality that data is harder to obtain, so we could try another approach.\n",
"\n",
"We could estimate how similar one person is to another and then propose a match percentage.\n",
"\n",
"We only have the biological gender and not the sexual orientation; having that information would greatly increase the precision of the outcome and allow a better match.\n",
"\n",
"The idea could be to estimate how similar these people are. One option is to apply dimensionality reduction and look at the distance between two people; that distance would show how similar they are based on the most important features.\n",
"\n",
"Another option would be to use all of the most important dimensions and compute the Euclidean distance.\n",
"\n",
"A third option would be to apply a clustering model, find groups of people that fall in the same cluster, and suppose they would make a couple.\n",
"\n",
"I don't have the time to try all of them, so I will try the third option, which is definitely possible with our data because clustering is not a supervised learning algorithm.\n",
"\n",
"In all cases it would be good to scale the data so that values range between 0 and 1. Also, since we lack the sexual preference, we should only form male/female pairs."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder\n",
"from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
"\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"start_time": "2023-06-20T09:46:05.333335800Z"
}
}
},
{
"cell_type": "code",
"execution_count": 20,
"outputs": [
{
"ename": "ValueError",
"evalue": "could not convert string to float: 'Montreal'",
"output_type": "error",
"traceback": [
"\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
"\u001B[1;31mValueError\u001B[0m Traceback (most recent call last)",
"\u001B[1;32m~\\AppData\\Local\\Temp\\ipykernel_21052\\2525431524.py\u001B[0m in \u001B[0;36m?\u001B[1;34m()\u001B[0m\n\u001B[1;32m----> 3\u001B[1;33m \u001B[1;32mfrom\u001B[0m \u001B[0msklearn\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mcluster\u001B[0m \u001B[1;32mimport\u001B[0m \u001B[0mKMeans\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 4\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 5\u001B[0m \u001B[0mkmeans\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mKMeans\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mn_clusters\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;36m2\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mrandom_state\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;36m0\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mn_init\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;34m\"auto\"\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mfit\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mdf\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
"\u001B[1;32m~\\miniconda3\\envs\\MchineLearning\\lib\\site-packages\\sklearn\\cluster\\_kmeans.py\u001B[0m in \u001B[0;36m?\u001B[1;34m(self, X, y, sample_weight)\u001B[0m\n\u001B[0;32m 1413\u001B[0m \u001B[0mFitted\u001B[0m \u001B[0mestimator\u001B[0m\u001B[1;33m.\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 1414\u001B[0m \"\"\"\n\u001B[0;32m 1415\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m_validate_params\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 1416\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m-> 1417\u001B[1;33m X = self._validate_data(\n\u001B[0m\u001B[0;32m 1418\u001B[0m \u001B[0mX\u001B[0m\u001B[1;33m,\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 1419\u001B[0m \u001B[0maccept_sparse\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;34m\"csr\"\u001B[0m\u001B[1;33m,\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 1420\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;33m[\u001B[0m\u001B[0mnp\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mfloat64\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mnp\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mfloat32\u001B[0m\u001B[1;33m]\u001B[0m\u001B[1;33m,\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
"\u001B[1;32m~\\miniconda3\\envs\\MchineLearning\\lib\\site-packages\\sklearn\\base.py\u001B[0m in \u001B[0;36m?\u001B[1;34m(self, X, y, reset, validate_separately, **check_params)\u001B[0m\n\u001B[0;32m 531\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 532\u001B[0m \u001B[1;32mif\u001B[0m \u001B[0mno_val_X\u001B[0m \u001B[1;32mand\u001B[0m \u001B[0mno_val_y\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 533\u001B[0m \u001B[1;32mraise\u001B[0m \u001B[0mValueError\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;34m\"Validation should be done on X, y or both.\"\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 534\u001B[0m \u001B[1;32melif\u001B[0m \u001B[1;32mnot\u001B[0m \u001B[0mno_val_X\u001B[0m \u001B[1;32mand\u001B[0m \u001B[0mno_val_y\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 535\u001B[1;33m \u001B[0mX\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mcheck_array\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mX\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0minput_name\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;34m\"X\"\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;33m**\u001B[0m\u001B[0mcheck_params\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 536\u001B[0m \u001B[0mout\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mX\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 537\u001B[0m \u001B[1;32melif\u001B[0m \u001B[0mno_val_X\u001B[0m \u001B[1;32mand\u001B[0m \u001B[1;32mnot\u001B[0m \u001B[0mno_val_y\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 538\u001B[0m \u001B[0my\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0m_check_y\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0my\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;33m**\u001B[0m\u001B[0mcheck_params\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
"\u001B[1;32m~\\miniconda3\\envs\\MchineLearning\\lib\\site-packages\\sklearn\\utils\\validation.py\u001B[0m in \u001B[0;36m?\u001B[1;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator, input_name)\u001B[0m\n\u001B[0;32m 875\u001B[0m \u001B[0marray\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mxp\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mastype\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mcopy\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;32mFalse\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 876\u001B[0m \u001B[1;32melse\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 877\u001B[0m \u001B[0marray\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0m_asarray_with_order\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0morder\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0morder\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mdtype\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mxp\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mxp\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 878\u001B[0m \u001B[1;32mexcept\u001B[0m \u001B[0mComplexWarning\u001B[0m \u001B[1;32mas\u001B[0m \u001B[0mcomplex_warning\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 879\u001B[1;33m raise ValueError(\n\u001B[0m\u001B[0;32m 880\u001B[0m \u001B[1;34m\"Complex data not supported\\n{}\\n\"\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mformat\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 881\u001B[0m ) from complex_warning\n\u001B[0;32m 882\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n",
"\u001B[1;32m~\\miniconda3\\envs\\MchineLearning\\lib\\site-packages\\sklearn\\utils\\_array_api.py\u001B[0m in \u001B[0;36m?\u001B[1;34m(array, dtype, order, copy, xp)\u001B[0m\n\u001B[0;32m 181\u001B[0m \u001B[1;32mif\u001B[0m \u001B[0mxp\u001B[0m \u001B[1;32mis\u001B[0m \u001B[1;32mNone\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 182\u001B[0m \u001B[0mxp\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0m_\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mget_namespace\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 183\u001B[0m \u001B[1;32mif\u001B[0m \u001B[0mxp\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m__name__\u001B[0m \u001B[1;32min\u001B[0m \u001B[1;33m{\u001B[0m\u001B[1;34m\"numpy\"\u001B[0m\u001B[1;33m,\u001B[0m \u001B[1;34m\"numpy.array_api\"\u001B[0m\u001B[1;33m}\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 184\u001B[0m \u001B[1;31m# Use NumPy API to support order\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m--> 185\u001B[1;33m \u001B[0marray\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mnumpy\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0masarray\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0morder\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0morder\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mdtype\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m 186\u001B[0m \u001B[1;32mreturn\u001B[0m \u001B[0mxp\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0masarray\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mcopy\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mcopy\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 187\u001B[0m 
\u001B[1;32melse\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m 188\u001B[0m \u001B[1;32mreturn\u001B[0m \u001B[0mxp\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0masarray\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0marray\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mdtype\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mcopy\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mcopy\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
"\u001B[1;32m~\\miniconda3\\envs\\MchineLearning\\lib\\site-packages\\pandas\\core\\generic.py\u001B[0m in \u001B[0;36m?\u001B[1;34m(self, dtype)\u001B[0m\n\u001B[0;32m 2069\u001B[0m \u001B[1;32mdef\u001B[0m \u001B[0m__array__\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mself\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m:\u001B[0m \u001B[0mnpt\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mDTypeLike\u001B[0m \u001B[1;33m|\u001B[0m \u001B[1;32mNone\u001B[0m \u001B[1;33m=\u001B[0m \u001B[1;32mNone\u001B[0m\u001B[1;33m)\u001B[0m \u001B[1;33m->\u001B[0m \u001B[0mnp\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mndarray\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m-> 2070\u001B[1;33m \u001B[1;32mreturn\u001B[0m \u001B[0mnp\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0masarray\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m_values\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mdtype\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mdtype\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m",
"\u001B[1;31mValueError\u001B[0m: could not convert string to float: 'Montreal'"
]
}
],
"source": [
"from sklearn.cluster import KMeans\n",
"\n",
"kmeans = KMeans(n_clusters=5, random_state=0, n_init=\"auto\").fit(df)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:48:49.160378200Z",
"start_time": "2023-06-20T09:48:49.066666900Z"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}