demo_eda/testML.ipynb

570 lines
221 KiB
Text
Raw Normal View History

2025-07-12 01:17:12 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:02.486093400Z",
"start_time": "2023-06-20T09:46:02.161449400Z"
}
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import warnings\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.manifold import TSNE\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.metrics import accuracy_score\n",
"from sklearn.tree import DecisionTreeClassifier"
]
},
{
"cell_type": "code",
"execution_count": 3,
"outputs": [
{
"data": {
"text/plain": " City Year Sport Discipline Event \\\n0 Montreal 1976.0 Aquatics Diving 3m springboard \n1 Montreal 1976.0 Aquatics Diving 3m springboard \n2 Montreal 1976.0 Aquatics Diving 3m springboard \n3 Montreal 1976.0 Aquatics Diving 3m springboard \n4 Montreal 1976.0 Aquatics Diving 10m platform \n\n Athlete Gender Country_Code Country Event_gender \\\n0 KÖHLER, Christa Women GDR East Germany W \n1 KOSENKOV, Aleksandr Men URS Soviet Union M \n2 BOGGS, Philip George Men USA United States M \n3 CAGNOTTO, Giorgio Franco Men ITA Italy M \n4 WILSON, Deborah Keplar Women USA United States W \n\n Medal \n0 Silver \n1 Bronze \n2 Gold \n3 Silver \n4 Bronze ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>City</th>\n <th>Year</th>\n <th>Sport</th>\n <th>Discipline</th>\n <th>Event</th>\n <th>Athlete</th>\n <th>Gender</th>\n <th>Country_Code</th>\n <th>Country</th>\n <th>Event_gender</th>\n <th>Medal</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>Montreal</td>\n <td>1976.0</td>\n <td>Aquatics</td>\n <td>Diving</td>\n <td>3m springboard</td>\n <td>KÖHLER, Christa</td>\n <td>Women</td>\n <td>GDR</td>\n <td>East Germany</td>\n <td>W</td>\n <td>Silver</td>\n </tr>\n <tr>\n <th>1</th>\n <td>Montreal</td>\n <td>1976.0</td>\n <td>Aquatics</td>\n <td>Diving</td>\n <td>3m springboard</td>\n <td>KOSENKOV, Aleksandr</td>\n <td>Men</td>\n <td>URS</td>\n <td>Soviet Union</td>\n <td>M</td>\n <td>Bronze</td>\n </tr>\n <tr>\n <th>2</th>\n <td>Montreal</td>\n <td>1976.0</td>\n <td>Aquatics</td>\n <td>Diving</td>\n <td>3m springboard</td>\n <td>BOGGS, Philip George</td>\n <td>Men</td>\n <td>USA</td>\n <td>United States</td>\n <td>M</td>\n <td>Gold</td>\n </tr>\n <tr>\n <th>3</th>\n <td>Montreal</td>\n <td>1976.0</td>\n <td>Aquatics</td>\n <td>Diving</td>\n <td>3m springboard</td>\n <td>CAGNOTTO, Giorgio Franco</td>\n <td>Men</td>\n <td>ITA</td>\n <td>Italy</td>\n <td>M</td>\n <td>Silver</td>\n </tr>\n <tr>\n <th>4</th>\n <td>Montreal</td>\n <td>1976.0</td>\n <td>Aquatics</td>\n <td>Diving</td>\n <td>10m platform</td>\n <td>WILSON, Deborah Keplar</td>\n <td>Women</td>\n <td>USA</td>\n <td>United States</td>\n <td>W</td>\n <td>Bronze</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the medal table, decoding the CSV as ISO-8859-1 (Latin-1).\n",
"df = pd.read_csv(\n",
"    'data/Summer-Olympic-medals.csv',\n",
"    encoding=\"ISO-8859-1\",\n",
")\n",
"df.head()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:02.533394500Z",
"start_time": "2023-06-20T09:46:02.486093400Z"
}
}
},
{
"cell_type": "code",
"execution_count": 4,
"outputs": [
{
"data": {
"text/plain": "(15433, 11)"
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:02.564875300Z",
"start_time": "2023-06-20T09:46:02.533394500Z"
}
}
},
{
"cell_type": "code",
"execution_count": 5,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 15433 entries, 0 to 15432\n",
"Data columns (total 11 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 City 15316 non-null object \n",
" 1 Year 15316 non-null float64\n",
" 2 Sport 15316 non-null object \n",
" 3 Discipline 15316 non-null object \n",
" 4 Event 15316 non-null object \n",
" 5 Athlete 15316 non-null object \n",
" 6 Gender 15316 non-null object \n",
" 7 Country_Code 15316 non-null object \n",
" 8 Country 15316 non-null object \n",
" 9 Event_gender 15316 non-null object \n",
" 10 Medal 15316 non-null object \n",
"dtypes: float64(1), object(10)\n",
"memory usage: 1.3+ MB\n"
]
}
],
"source": [
"df.info()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:02.627858600Z",
"start_time": "2023-06-20T09:46:02.549048800Z"
}
}
},
{
"cell_type": "code",
"execution_count": 6,
"outputs": [
{
"data": {
"text/plain": " Year\ncount 15316.000000\nmean 1993.620789\nstd 10.159851\nmin 1976.000000\n25% 1984.000000\n50% 1996.000000\n75% 2004.000000\nmax 2008.000000",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>Year</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>count</th>\n <td>15316.000000</td>\n </tr>\n <tr>\n <th>mean</th>\n <td>1993.620789</td>\n </tr>\n <tr>\n <th>std</th>\n <td>10.159851</td>\n </tr>\n <tr>\n <th>min</th>\n <td>1976.000000</td>\n </tr>\n <tr>\n <th>25%</th>\n <td>1984.000000</td>\n </tr>\n <tr>\n <th>50%</th>\n <td>1996.000000</td>\n </tr>\n <tr>\n <th>75%</th>\n <td>2004.000000</td>\n </tr>\n <tr>\n <th>max</th>\n <td>2008.000000</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:02.643454Z",
"start_time": "2023-06-20T09:46:02.580624Z"
}
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"data": {
"text/plain": "City 9\nYear 9\nSport 28\nDiscipline 41\nEvent 293\nAthlete 11337\nGender 2\nCountry_Code 128\nCountry 127\nEvent_gender 3\nMedal 3\ndtype: int64"
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.nunique()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:02.690327500Z",
"start_time": "2023-06-20T09:46:02.612190200Z"
}
}
},
{
"cell_type": "code",
"execution_count": 8,
"outputs": [
{
"data": {
"text/plain": "Empty DataFrame\nColumns: []\nIndex: [0, 1, 2, 3, 4]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n </tr>\n <tr>\n <th>1</th>\n </tr>\n <tr>\n <th>2</th>\n </tr>\n <tr>\n <th>3</th>\n </tr>\n <tr>\n <th>4</th>\n </tr>\n </tbody>\n</table>\n</div>"
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# keep only the constant columns, i.e. those holding exactly one distinct value\n",
"df_unique = df.loc[:, df.nunique().eq(1)]\n",
"df_unique.head()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:02.690327500Z",
"start_time": "2023-06-20T09:46:02.612190200Z"
}
}
},
{
"cell_type": "code",
"execution_count": 9,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: missingno in c:\\users\\stefa\\miniconda3\\envs\\mchinelearning\\lib\\site-packages (0.5.2)\n",
"Requirement already satisfied: numpy in c:\\users\\stefa\\miniconda3\\envs\\mchinelearning\\lib\\site-packages (from missingno) (1.23.5)\n",
"Requirement already satisfied: matplotlib in c:\\users\\stefa\\miniconda3\\envs\\mchinelearning\\lib\\site-packages (from missingno) (3.5.3)\n",
"Requirement already satisfied: scipy in c:\\users\\stefa\\miniconda3\\envs\\mchinelearning\\lib\\site-packages (from missingno) (1.9.3)\n",
"Requirement already satisfied: seaborn in c:\\users\\stefa\\miniconda3\\envs\\mchinelearning\\lib\\site-packages (from missingno) (0.12.2)\n",
"Requirement already satisfied: cycler>=0.10 in c:\\users\\stefa\\miniconda3\\envs\\mchinelearning\\lib\\site-packages (from matplotlib->missingno) (0.11.0)\n",
"Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\stefa\\miniconda3\\envs\\mchinelearning\\lib\\site-packages (from matplotlib->missingno) (4.39.4)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\stefa\\miniconda3\\envs\\mchinelearning\\lib\\site-packages (from matplotlib->missingno) (1.4.4)\n",
"Requirement already satisfied: packaging>=20.0 in c:\\users\\stefa\\miniconda3\\envs\\mchinelearning\\lib\\site-packages (from matplotlib->missingno) (23.1)\n",
"Requirement already satisfied: pillow>=6.2.0 in c:\\users\\stefa\\miniconda3\\envs\\mchinelearning\\lib\\site-packages (from matplotlib->missingno) (9.4.0)\n",
"Requirement already satisfied: pyparsing>=2.2.1 in c:\\users\\stefa\\miniconda3\\envs\\mchinelearning\\lib\\site-packages (from matplotlib->missingno) (3.0.9)\n",
"Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\stefa\\miniconda3\\envs\\mchinelearning\\lib\\site-packages (from matplotlib->missingno) (2.8.2)\n",
"Requirement already satisfied: pandas>=0.25 in c:\\users\\stefa\\miniconda3\\envs\\mchinelearning\\lib\\site-packages (from seaborn->missingno) (1.5.3)\n",
"Requirement already satisfied: pytz>=2020.1 in c:\\users\\stefa\\miniconda3\\envs\\mchinelearning\\lib\\site-packages (from pandas>=0.25->seaborn->missingno) (2023.3)\n",
"Requirement already satisfied: six>=1.5 in c:\\users\\stefa\\miniconda3\\envs\\mchinelearning\\lib\\site-packages (from python-dateutil>=2.7->matplotlib->missingno) (1.16.0)\n"
]
}
],
"source": [
"# Install missingno into the environment of the running kernel: %pip (unlike !pip)\n",
"# always targets the kernel's interpreter. Pinned to the version used for this run.\n",
"%pip install -q missingno==0.5.2\n",
"# note: reported to need matplotlib 3.5.x to work (3.5.3 was installed for this run)\n",
"import missingno as msno"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:04.395738300Z",
"start_time": "2023-06-20T09:46:02.627858600Z"
}
}
},
{
"cell_type": "code",
"execution_count": 10,
"outputs": [
{
"data": {
"text/plain": "<Figure size 2500x1000 with 2 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAACBAAAAOjCAYAAAAIhmViAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAACzwUlEQVR4nOzdd3RUVd/28e+kh4SEEgIEQi+hdwEBqaKAiEjvSO9NmvQqIB1CDb03QZAmgoKU0JsQehGkd0iByZT3D945d0ITfNQQuD5r3evRzMzx7LX2c87Ze1/nt012u92OiIiIiIiIiIiIiIiIiIiIvNec4voEREREREREREREREREREREJO4pQCAiIiIiIiIiIiIiIiIiIiIKEIiIiIiIiIiIiIiIiIiIiIgCBCIiIiIiIiIiIiIiIiIiIoICBCIiIiIiIiIiIiIiIiIiIoICBCIiIiIiIiIiIiIiIiIiIoICBCIiIiIiIiIiIiIiIiIiIoICBCIiIiIiIiIiIiIiIiIiIoICBCIiIiIiIiIiIiIiIiIiIoICBCIiIiIiIiIiIiIiIiIiIoICBCIiIiIiIiIiIiIiIiIiIoICBCIiIiIiIiIiIiIiIiIiIoICBCIiIiIiIiIiIiIiIiIiIoICBCIiIiIiIiIiIiIiIiIiIoICBCIiIiIiIiIiIiIiIiIiIoICBCIiIiIiIiIiIiJxzmazAWC325/77EV/ExERERH5N7jE9QmIiIiIiIiIiIiIvM+sVivOzs4A3Lhxgzt37mCxWHBxcSFHjhyYTCbgaZDA8c8iIvFdzGsf6BonIvK2MNkVXxURERERERERERGJEzabDSenp4ViZ8+ezffff8/Zs2eNz6tUqULlypUpUqQILi4uWmATkXdCzGvf4sWLyZYtG3nz5o3bkxIREUABAhEREREREREREZE4N3r0aEJCQkiRIgVly5bFZDKxbds2rly5QmBgIPXq1aNevXqx3tYVEYnvvvvuO2bNmsVnn33GoEGDSJAgQVyfkojIe09bGIiIiIiIiIiIiIj8R15UQWDdunWEhITw4Ycf0rNnT7JkyQJAu3btaN68Ob///junTp3i0aNHJEqUKA7OWkTknxFz24ITJ06wdOlSChQoQIMGDRQeEBF5SzjF9QmIiIiIiIiIiIiIvOsiIiIAMJlMPFsUdt++fTg7O9OyZUsjPACwZMkSfv/9d4oVK0arVq0wm80cP34c4LljiIjEB47wwKpVq/jxxx958uQJ7dq1I0+ePHF8ZiIi4qAAgYiIiIiIiIiIiMi/aM+ePfTu3ZsjR44AxKpAEBkZyaFDh0iePHms/b+Dg4MZN24cxYoVo1u3bri5udG0aVOWLFny3DFEROKTHTt28M0337B27Vr8/f3x9/cHIDo6Oo7PTEREQAECERERERERERERkX/NgwcPCA4OZuPGjcybN49jx44Zn9ntdqOSwL1797hy5QrwNDwQHBxMsWLF+PrrrwkKCuL69eucO3eO3bt3Ex4eHidtERH5J+TMmZM2bdpw+/Ztrl69yooVKwBwdXXFZrPF8dmJiIhLXJ+AiIiIiIiIiIiIyLvK19eXpk2b4uLiwvr167FarTRr1oycOXNiMpnw8vLiww8/5MyZM/z++++sX78+Vngge/bsAKRJk4YkSZLg5OSEk5PeCxOR+MdqteLs7EyiRIlo0KAB7u7ujB07ltmzZ5M6dWrq1auHk5MTNptN1zkRkTikAIGIiIiIiIiIiIjIv8But2MymShVqhTOzs5YLBZ++ukn7HY7TZs2JXfu3ADkypULm81Gr169sFqtlC5dmjZt2hjhAYADBw5w9+5dypQpQ4IECeKqSSIir+3ZIICzs7Pxz4kTJ6Z69epYrVYmTJjAuHHjcHV1pWbNmgoRiIjEMQUIRERERERERERERP4FJpOJ6OhoXF1dKVGiBC4uLkyePJnNmzfj4uJCw4YNyZMnDxUrVuTQoUPMnz8fJycnCh
cuTK5cuYzj7Nu3j6lTp+Lu7k7p0qXjsEUiIq/HUW0AIDQ0lPPnz3P48GGCgoJIkyYNH3/8MUmTJqVhw4bYbDaCg4MZNWoUgEIEIiJxzGR3bLIlIiIiIiIiIiIiIv+YmItfmzdv5vfff2fTpk1cuHABV1dXypcvT4MGDcibNy8AvXv35vvvv8fFxYXKlSsTEBBAVFQUq1at4t69e/Tu3ZsGDRrEYYtERP5azGvf+PHjmT17No8fP471nS+//JJGjRqRNWtWwsPDmTNnDsHBwfj4+NC1a1dq1qz53LFEROS/oQCBiIiIiIiIiIiIyL9o9OjRhISEkChRIrJmzYrVauXIkSNYrVbKlStHkyZNjBDB5MmT2bBhA2fOnDF+nzFjRpo0aUK1atUALaiJSPwwadIkJk6cSL58+WjatCleXl7cvHmTyZMn88cff1C6dGk6d+5MlixZePDgAfPnzyc4OJjEiRPToUMH6tSpE9dNEBF5L2kLAxEREREREREREZF/yZo1awgJCaF48eJ069aNrFmzAk8rEqxevZpNmzYBGCGCNm3a8MUXX/DHH39w584dUqVKRfLkyQkICAAUHhCR+OHIkSPMnTuXLFmy0L9/f4KCgozP9u7dyx9//EFERAQ+Pj4A+Pr6UrduXZydnRk/fjwDBw6kQIECZMmSJa6aICLy3lKAQERERERERERERORfsn//ftzc3GjTpg1Zs2YlOjoaV1dXypUrR/r06UmQIAGrV6/GycmJxo0bkzdvXgICAozAQEx2u13hARF5a1y/fh0vLy8SJkz43GcXLlzg4cOH9O7dO1Z4IDg4mO+//54SJUowYMAAIiIimD17Nl999RVJkiShTp06REZGkjRpUoUHRETiiAIEIiIiIiIiIiIiIv8wu93OkydP2LNnD+7u7qRKlQqr1Yqzs7PxnYwZM9K4cWMuXbrE5s2bcXFxwcnJidy5cxvHMJlMxvdj/rOISFzat28fTZs2pV27dtStWxdvb28A4zp37NgxABIkSGD8Jjg4mODgYIoVK0b79u1JlSoVn376KRcvXiR//vzkyZOHRIkS0b59e9zc3ABVXRERiQu66oqIiIiIiIiIiIj8w0wmEx4eHmTKlImoqCiuX7+Os7Pzcwth2bJlo1SpUlgsFrZs2cL06dM5cuSIcQwRkbfR77//jtlsZsqUKaxYsYJHjx4BGCGpjBkzAvDgwQMAJk6caIQHvv76ayMolTlzZuBpYMrBER4AFB4QEYkDuvKKiIiIiIiIiIiI/EuCgoKwWCxMmTKFq1evxvosOjoagMKFC5M0aVJSp07N5s2buXfvXlycqojIa2vSpAk9e/YEYPTo0Xz//fdGiAAgVapUAHz33Xf069ePSZMmUaxYMbp06UL27NmN71ksFlxcXPDy8vpvGyAiIi+lAIGIiIiIiIiIiIjI32Sz2WL9u8ViwWKxGP/etGlT8uTJw+7du1myZAk3b940vufq6grAiRMnePDgAa1bt2bevHmUKlXqPzt/EZE3ZbVaAWjcuDFt27bFxcXFCBE4Kg589NFHVKtWjYcPH7J8+XLy58/P4MGDyZEjh3Gcffv2ERoaSo4cOfD19Y1VhUBEROKOS1yfgIiIiIiIiIiIiEh85NjrG2DLli2EhYVx+PBhnJ2dKVWqFFmzZqVAgQI0adKEESNGsGDBAh49ekTjxo1JmzYtAAcOHGDFihWkT5+e/PnzkyJFCkD7fotI/NCsWTNu377N0qVLGTNmDM7OzlSqVIkkSZLQpUsXbt26xW+//cb169d5+PAhfn5+uLm5sXXrVqZOncrjx49p0KAB/v7+cd0UERH5/0x2RbpERERERERERERE3kjMBf6xY8cyc+ZMLBYLbm5umM1mnJycSJEiBe3atePLL79kzZo1TJs2jXPnzuHj40OxYsWwWq2Ehoby6NEj+vbtS7169eK4VSIirxbz2jdt2jR2795NWFgYUVFRmM1m3N3d6dy5M9WrV8fb25vz58
8zevRotmzZgslkIlOmTDg7O3Py5EkAvvnmGxo1agSA3W7HZDLFWdtEROQpBQhERERERERERERE/qYpU6Ywfvx4ChU
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Silence any warnings emitted by the plot call. catch_warnings restores the previous\n",
"# warning filters on exit (even on error) instead of forcing them to 'default'.\n",
"with warnings.catch_warnings():\n",
"    warnings.simplefilter('ignore')\n",
"    msno.matrix(df)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:04.681718300Z",
"start_time": "2023-06-20T09:46:04.408258500Z"
}
}
},
{
"cell_type": "code",
"execution_count": 11,
"outputs": [
{
"data": {
"text/plain": "<Figure size 2000x1200 with 2 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAABi0AAAQxCAYAAABS9M4qAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdeZjd4/0//ufMZE8QiSR20VTGEiQiQtWSBG0QS8TeELFE1a4oRYui+iX6kTT42FJLUbFr7FtREhIqFSqJ0tgiqCWbSTLz+8Mv8zGdBInEvCfzeFzXuTj3+z7v8zrnjl7NPOe+XyVVVVVVAQAAAAAAqGOldV0AAAAAAABAIrQAAAAAAAAKQmgBAAAAAAAUgtACAAAAAAAoBKEFAAAAAABQCEILAAAAAACgEIQWAAAAAABAIQgtAAAAAACAQhBaAAAAAAAAhSC0AAAAAACABuCjjz7KjjvumDFjxixyzhNPPJF+/fqla9eu6du3bx577LEa16+88spsu+226dq1awYOHJjXX399qdYotAAAAAAAgOXcuHHjsu++++bf//73Iue88cYbOeaYY3Lcccfl+eefzzHHHJPjjz8+06ZNS5Lccccduf7663P11VdnzJgx2WijjXLsscemqqpqqdUptAAAAAAAgOXYHXfckZ///Oc54YQTvnbe5ptvnh122CGNGjXKzjvvnB49euSWW25Jkvz5z3/OAQcckPXWWy9NmzbNSSedlHfeeecrd24sLqEFAAAAAADUMxUVFZkxY0aNR0VFxULn/vCHP8xDDz2UnXfe+SvvOXny5HTu3LnG2Pe///28+uqrC73euHHjdOzYsfr60tBoqd0JAAAAAADqmSbdBtd1CUvk4sHdMnz48BpjRx99dI455phac9u1a/eN7jlz5sw0b968xlizZs0ya9asb3R9aRBaAAAAAABAPTNkyJAccsghNcaaNGnyre7ZvHnzzJkzp8bYnDlz0rJly290fWlwPBQAAAAAANQzTZo0SatWrWo8vm1o0blz50yaNKnG2OTJk7PeeuslSdZbb70a1+fOnZs33nij1pFS34bQAgAAAACABquktKxePpaF3XbbLWPHjs3o0aMzb968jB49OmPHjs3uu++eJNlrr71yww035NVXX83nn3+eiy++OKussko233zzpVaD0AIAAAAAABqobt265e67706SdOrUKX/4wx9yxRVXpEePHhkxYkSGDRuWddddN0kyYMCADBo0KD/72c+y5ZZbZuLEibniiivSuHHjpVZPSVVVVdVSuxsAAAAAANQjTbsfXtclLJHPx11Z1yUsE3ZaAAAAAAAAhdCorgsAAAAAAIC6sqz6Q7Bk7LQAAAAAAAAKQWgBAAAAAAAUguOhAAAAAABosBwPVSx2WgAAAAAAAIUgtAAAAAAAAApBaAEAAAAAABSCnhYAAAAAADRYeloUi50WAAAAAABAIQgtAAAAAACAQhBaAAAAAAAAhaCnBQAAAAAADVZJmZ4WRWKnBQAAAAAAUAhCCwAAAAAAoBAcDwUAAAAAQINVWup4qCKx0wIAAAAAACgEoQUAAAAAAFAIQgsAAAAAAKAQ9LQAAAAAAKDBKtHTolDstAAAAAAAAApBaAEAAAAAABSC46EAAAAAAGiwHA9VLHZaAAAAAAAAhSC0AAAAAAAACkFoAQAAAAAAFIKeFnwjVVVVKSkpqesyAAAAAACWqpJSv9tfJFZjOVZRUZHRo0dnyJAh6dOnTzbeeOP06NEjBxxwQK6//vpUVFTUmD9s2LCUl5fnkksuqTH+l7/8JSeeeOJ3WToAAAAAAA2QnRbLqcmTJ+f444/PpEmT0rx585SXl2ejjTbK+++/nwkTJmTcuHG55ZZbMnLkyKyyyiqLvM9zzz2XE088MZttttl3WD0AAAAAAA2R0GI59Oabb2afffbJzJkzM3DgwPzsZz/LyiuvXH39vffey+mnn56nn346Bx98cG677bY0a9
YsBx54YHbeeecacysrK+viIwAAAAAA0AAJLZYzVVVVOemkkzJz5swceeSROeGEE2rNWXXVVTN8+PDsueeemTx5cm699dYMHDgwbdq0SZs2beqgagAAAACAulFSWlbXJfAlelosZ8aNG5cJEyakXbt2+elPf7rIeS1atMiQIUOy+eabV4/9d0+LX/ziFznooIOSJOPHj095eXkGDhyYf/zjHykvL89222230J0Yn3/+eXr06JHu3btnzpw5S/kTAgAAAACwvLLTYjkzevToJMkOO+yQZs2afeXc/v37p3///ou83q1bt0ybNi1/+9vf0qZNm2y99dbp1KlTunTpkvXXXz+vvvpqxowZk6222qrG6x5++OF8+umn2Xfffb+2BgAAAAAAWEBosZx5/fXXkySbbrrpt77Xvvvum44dO+Zvf/tbOnbsmIsuuqj62l577ZXzzjsvd911V63Q4o477kiSrwxEAAAAAACKwPFQxeJ4qOXM9OnTkyRt27Zdpu+z2267pUmTJnnggQcye/bs6vEFOzO+973vpWvXrsu0BgAAAAAAli9Ci+VMWdkXqeD8+fOX6fu0bt06ffr0yaxZs/LQQw9Vj991112ZP3++XRYAAAAAACw2ocVypl27dkmSDz/8cJm/14ABA5J8EVQscOedd6asrCy77777Mn9/AAAAAACWL0KL5UyXLl2SJC+99NLXzv3ss89y4YUX5oknnsi8efMW+71+8IMfZPXVV88zzzyT6dOnZ8KECZkyZUq22WabtG/ffrHvBwAAAADwXSspLauXj+WV0GI506dPnyTJY489ls8///wr595///255ppr8otf/CIlJSWL/V6lpaXZc889M3/+/Dz88MN58MEHk2jADQAAAADAkhFaLGc22WSTbL755nn//fdz+eWXL3Lexx9/nCuuuCJJsu+++1b3wvhvXxdm9O/fPyUlJXnwwQfzyCOPpHXr1unVq9eSfwAAAAAAABosocVy6Ne//nWaNWuWESNG5MILL8ynn35a4/rUqVNz5JFHZurUqenYsWMOP/zwRd6radOmSZIZM2Ys9Pqaa66ZLbfcMs8++2ymTJmSfv36pUmTJkvvwwAAAAAA0GA0qusCWPrWW2+9/PGPf8yRRx6Za665JjfddFO6dOmSVVZZJe+9915eeumlzJ8/P507d84VV1yRli1bLvJea665ZsrKyvLaa6/l4IMPTnl5eU4//fQacwYMGJBnnnkmSbLXXnst088GAAAAALA0lSziFBrqhp0Wy6muXbtm9OjROeaYY7Leeuvl1VdfzYMPPpjXX3893bt3z69+9avcdtttWX311b/yPm3bts15552XNddcM+PGjctjjz1Wa0737t2TJBtssEE22GCDZfJ5AAAAAABY/pVUVVVV1XUR1G/XXnttfvvb3+ass87KgQceWNflAAAAAAB8Y6v0+21dl7BEPrjnF3VdwjLheCiWyJw5c9KsWbO89tprueqqq9KqVavsvvvudV0WAAAAAMBiKSl1PFSRCC1YIiNGjMjIkSPz+eefJ0lOOeWUtGrVqo6rAgAAAACgPhNasEQ23HDDtGjRIi1btsz++++fwYMH13VJAAAAAADUc0ILlsiPf/zj/PjHP67rMgAAAAAAWI4ILQAAAAAAaLD0tCiW0rouAAAAAAAAIBFaAAAAAAAABeF4KAAAAAAAGqxSx0MVip0WAAAAAABAIQgtAAAAAACAQhBaAAAAAAAAhaCnBQAAAAAADVaJnhaFYqcFAAAAAABQCEILAAAAAACgEIQWAAAAAABAIehpAQAAAABAg6WnRbHYaQEAAAAAABSC0AIAAAAAACgEx0MBAAAAANBgOR6qWOy0AAAAAAAACkFoAQAAAAAAFILQAgAAAAAAKAQ9LQAAAAAAaLD0tCgWOy0AAAAAAIBCEFoAAAAAAACF4HgoAAAAAAAaLMdDFYudFgAAAAAAQCEILQAAAAAAgEIQWgAAAAAAAIWgpwUAAAAAAA
1WSZmeFkVipwUAAAAAAFAIQgsAAAAAAKAQhBYAAAAAAEAh6GkBAAAAAECDVVKqp0WRCC0ojCbdBtd1CXxLFS9cU9c
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Silence any warnings emitted by the plot call. catch_warnings restores the previous\n",
"# warning filters on exit (even on error) instead of forcing them to 'default'.\n",
"with warnings.catch_warnings():\n",
"    warnings.simplefilter('ignore')\n",
"    msno.heatmap(df)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:05.067628400Z",
"start_time": "2023-06-20T09:46:04.681718300Z"
}
}
},
{
"cell_type": "code",
"execution_count": 12,
"outputs": [
{
"data": {
"text/plain": " City Year Sport Discipline Event Athlete Gender Country_Code Country \\\n770 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n771 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n772 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n773 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n774 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n.. ... ... ... ... ... ... ... ... ... \n882 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n883 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n884 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n885 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n886 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n\n Event_gender Medal \n770 NaN NaN \n771 NaN NaN \n772 NaN NaN \n773 NaN NaN \n774 NaN NaN \n.. ... ... \n882 NaN NaN \n883 NaN NaN \n884 NaN NaN \n885 NaN NaN \n886 NaN NaN \n\n[117 rows x 11 columns]",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>City</th>\n <th>Year</th>\n <th>Sport</th>\n <th>Discipline</th>\n <th>Event</th>\n <th>Athlete</th>\n <th>Gender</th>\n <th>Country_Code</th>\n <th>Country</th>\n <th>Event_gender</th>\n <th>Medal</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>770</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>771</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>772</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>773</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>774</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>...</th>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n <td>...</td>\n </tr>\n <tr>\n <th>882</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>883</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n 
<td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>884</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>885</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>886</th>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n </tbody>\n</table>\n<p>117 rows × 11 columns</p>\n</div>"
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# show only the rows with more than 2 missing values\n",
"df[df.isna().sum(axis=1) > 2]"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:05.225119700Z",
"start_time": "2023-06-20T09:46:05.067628400Z"
}
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [],
"source": [
"# drop the empty rows (rows with more than 2 missing values)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:05.225119700Z",
"start_time": "2023-06-20T09:46:05.098890Z"
}
}
},
{
"cell_type": "code",
"execution_count": 14,
"outputs": [
{
"data": {
"text/plain": "Empty DataFrame\nColumns: [City, Year, Sport, Discipline, Event, Athlete, Gender, Country_Code, Country, Event_gender, Medal]\nIndex: []",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>City</th>\n <th>Year</th>\n <th>Sport</th>\n <th>Discipline</th>\n <th>Event</th>\n <th>Athlete</th>\n <th>Gender</th>\n <th>Country_Code</th>\n <th>Country</th>\n <th>Event_gender</th>\n <th>Medal</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table>\n</div>"
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# count the missing fields of every row, then discard rows missing more than 2 of them\n",
"missing_vals = df.isna().sum(axis=1)\n",
"drop_rows = missing_vals[missing_vals > 2].index\n",
"\n",
"df = df.drop(index=drop_rows).reset_index(drop=True)\n",
"\n",
"# sanity check: no such rows should remain (expect an empty frame)\n",
"df[df.isna().sum(axis=1) > 2]"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:05.225119700Z",
"start_time": "2023-06-20T09:46:05.114909Z"
}
}
},
{
"cell_type": "code",
"execution_count": 15,
"outputs": [
{
"data": {
"text/plain": "Empty DataFrame\nColumns: [City, Year, Sport, Discipline, Event, Athlete, Gender, Country_Code, Country, Event_gender, Medal, feature]\nIndex: []",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>City</th>\n <th>Year</th>\n <th>Sport</th>\n <th>Discipline</th>\n <th>Event</th>\n <th>Athlete</th>\n <th>Gender</th>\n <th>Country_Code</th>\n <th>Country</th>\n <th>Event_gender</th>\n <th>Medal</th>\n <th>feature</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table>\n</div>"
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# flag outliers: values more than 3 standard deviations away from their column mean\n",
"numerical_cols = df.select_dtypes(include=['float64', 'int64'])\n",
"\n",
"means = numerical_cols.mean()\n",
"stds = numerical_cols.std()\n",
"\n",
"# set the threshold to 3 standard deviations\n",
"threshold = 3\n",
"\n",
"# collect one frame per column and concatenate once, instead of growing a frame in the loop\n",
"outlier_frames = []\n",
"for col in numerical_cols.columns:\n",
"    lower = means[col] - threshold * stds[col]\n",
"    upper = means[col] + threshold * stds[col]\n",
"    # .assign returns a new frame, so the label is never written into a slice of df\n",
"    # (the previous item assignment on the filtered slice triggers SettingWithCopyWarning)\n",
"    col_outliers = df[(df[col] < lower) | (df[col] > upper)].assign(feature=col)\n",
"    outlier_frames.append(col_outliers)\n",
"\n",
"# with no numeric columns there is nothing to concatenate; fall back to an empty frame\n",
"outliers = pd.concat(outlier_frames) if outlier_frames else pd.DataFrame()\n",
"\n",
"outliers.head()"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:05.225119700Z",
"start_time": "2023-06-20T09:46:05.146554900Z"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 19,
"outputs": [
{
"data": {
"text/plain": "array([[<AxesSubplot:title={'center':'Year'}>]], dtype=object)"
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"text/plain": "<Figure size 640x480 with 1 Axes>",
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAiwAAAGvCAYAAAB4u44CAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMywgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/NK7nSAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAxZUlEQVR4nO3de1xVdb7/8TcbULY3QDG1Tk5HuVReEmnwmqVn9sPHaEoBho+YjtrRJiK7HbS8lE48ND1N5ThzYkotctRs0JiJDmn1GCszITOmsU4Yu5rEMW8oDmwhbuv3Rz/3aSeIwMb93fB6Ph77D7+ftdb+ri/f7+rdXvsSYFmWJQAAAIPZfN0BAACA5hBYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAtLuMjAzFxMTo/fffb7S+e/duxcTEaPXq1Ze4ZwD8RQBfzQ+gvZWXl+vmm29W165d9frrr8tut7trLpdL06ZNU7du3fTqq6+qS5cuPuwpAFPxCguAdhcWFqbly5fr8OHDWrNmjUftqaee0vHjx/Xkk08SVgA0icAC4JL42c9+pptvvll/+MMfdODAAUnSxx9/rC1btujee+/VNddcoyNHjuihhx5SfHy8rrvuOs2aNUv/+7//63Gcw4cPa+HChRo/fryGDBmiMWPGaOHChTp9+rR7m0mTJmnlypWaNWuWRo4cqccee+ySnisA7+OWEIBL5vTp07r55pt1+eWXa8uWLUpKSpLdbteWLVt05swZ3XLLLbLb7br33ntlt9v10ksv6dNPP9W2bds0ePBgVVVVaerUqQoPD9fdd9+tnj17av/+/frv//5vJSUlKTMzU9L3geXYsWNKTU3VjTfeqJCQEMXFxfn47AG0RZCvOwCg8wgPD9fy5ct177336s4779ShQ4f0pz/9SYGBgXrppZdUXl6ul19+WVdccYUkacKECZoyZYp+85vfaO3atfr73/+u/v37a9WqVRo4cKAkafTo0Tpw4IA+/PBDj+e67LLL9Mgjj8hm44VkoCMgsAC4pBwOh6ZMmaL8/Hw99thjuuqqqyRJe/fu1TXXXKN+/fqprq5OkmSz2TRhwgS99tprkqRrrrlGW7ZsUUNDg0pLS/X3v/9dJSUl+uqrr9z7nDN48GDCCtCBEFgAXHI33HCD8vPzdeONN7rbysvL9c0332jIkCGN7lNVVSW73a4XX3xRzz33nE6fPq2IiAgNGTJEdrtdFRUVHttHRES06zkAuLQILACM0LNnT8XHx2vhwoWN1rt06aK8vDytWrVK//mf/6nk5GT17t1bknT//fe738gLoGMisAAwQnx8vPLy8vSv//qv6tGjh7t9xYoV+u677/T4449r//796tmzp+666y533eVyaf/+/QoK4nIGdGTc4AVghNmzZ6uhoUGzZ89Wfn6+9u7dq0cffVQbN27UoEGDJEnDhw9XRUWFVq1apcLCQuXl5Sk1NVUnT55UVVWVj88AQHvif0kAGKFfv37aunWrnnrqKS1fvlzfffedrrrqKq1YsULJycmSpFtvvVWHDx/W9u3btWXLFvXr10833nijbr/9dj366KNyOp2KjIz08ZkAaA98DwsAADAet4QAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAON1uG+6LSurUGNfhRcQIPXp07PJemfG2DSOcWkaY9M4xqVxjEvTGJv/G4PmdLjAYlm64B+9uXpnxtg0jnFpGmPTOMalcYxL0xib5nFLCAAAGI/AAgAAjEdgAQAAxiOwAAAA4xFYAACA8QgsAADAeAQWAABgPAILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxOtyvNQMAYDqbLUA2W4D734GB5r9+0NBgqaHBdz8pTWABAOASstkCFBrWTUE/CCnh4d192KOLU1ffoDPlZ30WWggsAA
BcQjZbgIICbbp/a5Gcxyt93Z2LEnlZD/1mZqxstgACCwAAnYnzeKU+O/JPX3fDb5h/0wwAAHR6LQosxcXFmjNnjuLj4zVu3DgtXLhQp06dkiR98sknmjFjhmJjYzVp0iTl5OR47JubmyuHw6ERI0YoMTFRRUVF7lp9fb1Wr16tsWPHKjY2VmlpaTp+/LgXTg8A0BI2W4CCgmxefZx7Q2lgoHePe+7xwzevouO66FtC1dXVmjt3rm677TY999xzcrlcevjhh7V48WKtXr1ad911l+677z6lpKRo3759Sk9PV0xMjIYPH67CwkJlZmZq3bp1Gj58uDZv3qy0tDTt2rVLdrtdWVlZ2rNnj7Zv366ePXvq0Ucf1dKlS/X888+357kDAH6gsTeDelN7vbHU128GxaVx0YHlyJEjuvrqq5Wenq7AwEB16dJFKSkpWrhwod58802FhYUpNTVVkjRmzBhNmzZNmzdv1vDhw5WTk6OpU6cqLi5OkjR79my98sorys/PV1JSknJycpSRkaEBAwZIkpYsWaLx48ertLRUV155ZTucNgDgx3gzKEx20YFl0KBBWr9+vUfbzp07NWTIEJWUlCg6OtqjFhkZqW3btkmSnE6nkpKSzqsXFxeroqJCR48e9dg/IiJCoaGhOnjwYIsDS0ATrwyea2+q3pkxNo1jXJrG2DSuo4yLP74Z1B++x+Qcf+prY7w9vy/2eK36lJBlWVqzZo127dqlTZs2aePGjbLb7R7bhISE6OzZs5Ikl8vVZN3lckmSunXrdl79XK0l+vTp2aZ6Z8bYNI5xaRpj0zjG5dLp26Or6hss9eplb35jtJkvvy+mxYGlsrJSixYt0meffaZNmzYpJiZGdrtdFRUVHttVV1ere/fvT8xut6u6uvq8enh4uDvIVFVVNbl/S5SVVchq5FXBgIDvLyJN1TszxqZxjEvTGJvG+fu4BAba/OILzH6olz1IgbYAv7qNdVNMXy2YfLWvu9Eqp0+7VF/f4NVjnls3zWlRYDl06JDmzZunyy+/XNu2bVPv3r0lSdHR0dqzZ4/Htk6nU1FRUZKkqKgolZSUnFefMGGCQkND1a9fPzmdTvdtoRMnTqi8vPy820wXw7J0wQtFc/XOjLFpHOPSNMamcYzLpedPt7EG9/WvUPhjvprbF30j7cyZM5o1a5ZGjhypDRs2uMOKJDkcDp08eVLZ2dmqra1VQUGB8vLy3O9bSU5OVl5engoKClRbW6vs7GyVlZXJ4XBIkhITE5WVlaXS0lJVVlZq5cqVio+P18CBA718ugAAwB9d9Cssr776qo4cOaI33nhDO3bs8KgVFRXphRde0IoVK7R27Vr17t1bS5cu1ejRoyV9/6mhZcuWafny5Tp27JgiIyO1bt06hYWFSZLS09NVV1en1NRUuVwujRo1SmvWrPHaSQIAAP920YFlzpw5mjNnTpP1YcOGaevWrU3WExISlJCQ0GgtODhYGRkZysjIuNjuAACATsS/P1sFAAA6BQILAAAwHoEFAAAYj8ACAACMR2ABAADGI7AAAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYLwgX3cAADoqmy1ANluAr7tx0QID+X9YmIvAAgDtwGYLUGhYNwURAgCvILAAQDuw2QIUFGjT/VuL5Dxe6evuXJSbYvpqweSrfd0NoFEEFgBoR87jlfrsyD993Y2LMrhvd193AWgSr1UCAADjEVgAAIDxCCwAAMB4BBYAAGA8AgsAADAegQUAABiPwAIAAIxHYAEAAMYjsAAAAOMRWAAAgPEILAAAwHgEFgAAYLxWB5ZTp07J4XCosLBQkvTYY48pNjbW43HNNdfoP/7jP9z7/PznP9d1113nsc2XX34pSaqvr9fq1as1duxYxcbGKi0tTcePH2/j6QEAgI6gVYFl//79SklJ0aFDh9xtjz/+uIqKit
yP3/72t+rVq5ceeeQRSVJlZaW+/vpr5efne2w3ePBgSVJWVpb27Nmj7du3a/fu3QoJCdHSpUu9cIoAAMDftTiw5Ob
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# check the distribution of every numeric column\n",
"df_numeric_ = df.select_dtypes(include=[np.number])\n",
"# trailing ';' keeps the array-of-Axes repr out of the cell output; the figure still renders\n",
"df_numeric_.hist();"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:47:41.224385500Z",
"start_time": "2023-06-20T09:47:41.114550700Z"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# Check pairwise relations between the numeric columns, coloured by gender.\n",
"# The earlier `numerical_cols.columns[:-1]` slice came out empty here, which made seaborn raise\n",
"# \"ValueError: No variables found for grid columns.\"; take the numeric columns of `df` directly instead.\n",
"pairplot_vars = df.select_dtypes(include=[np.number]).columns.tolist()\n",
"\n",
"# trailing `;` keeps the PairGrid repr out of the cell output\n",
"sns.pairplot(df, vars=pairplot_vars, hue=\"Gender\");"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:46:05.364587500Z",
"start_time": "2023-06-20T09:46:05.162190600Z"
}
}
},
{
"cell_type": "raw",
"source": [
"Ideally, we would already have information on how two people view each other and could try to infer the missing values. In reality such data is hard to obtain, so we could try another approach.\n",
"\n",
"We could estimate how similar one person is to another and then propose a match percentage.\n",
"\n",
"We only have the biological gender and not the sexual orientation; having that information would greatly improve the precision of the matching.\n",
"\n",
"The idea is to estimate how similar these people are. One option is to apply dimensionality reduction and look at the distance between two people. That distance would tell us how similar they are based on the most important features.\n",
"\n",
"Another way would be to use all of the most important dimensions and compute the Euclidean distance.\n",
"\n",
"A third way would be to apply a clustering model, find groups of people that fall in the same cluster, and suppose they would make a couple.\n",
"\n",
"I don't have time to do all of them, so I will try the third option, which is definitely possible with our data because clustering is not a supervised learning algorithm.\n",
"\n",
"In all cases it would be good to scale the data so that values range between 0 and 1. Also, since we lack the sexual preference, we should only form male/female pairs."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder\n",
"from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
"\n"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"start_time": "2023-06-20T09:46:05.333335800Z"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"from sklearn.cluster import KMeans\n",
"from sklearn.preprocessing import MinMaxScaler\n",
"\n",
"# KMeans only accepts a numeric, NaN-free matrix; fitting the raw frame failed with\n",
"# \"could not convert string to float: 'Montreal'\", so the data is encoded before fitting.\n",
"MAX_ONE_HOT_LEVELS = 50  # skip text columns with more distinct values than this (one-hot width would explode)\n",
"\n",
"df_complete = df.dropna()\n",
"numeric_features = df_complete.select_dtypes(include=[np.number])\n",
"categorical_features = df_complete.select_dtypes(exclude=[np.number])\n",
"categorical_features = categorical_features.loc[:, categorical_features.nunique() <= MAX_ONE_HOT_LEVELS]\n",
"\n",
"# one-hot encode the categoricals, then rescale every feature to [0, 1] so no column dominates the distances\n",
"cluster_features = pd.concat([numeric_features, pd.get_dummies(categorical_features, dtype=float)], axis=1)\n",
"cluster_features_scaled = MinMaxScaler().fit_transform(cluster_features)\n",
"\n",
"kmeans = KMeans(n_clusters=5, random_state=0, n_init=\"auto\").fit(cluster_features_scaled)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-06-20T09:48:49.160378200Z",
"start_time": "2023-06-20T09:48:49.066666900Z"
}
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}