{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2b6fb168",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "\n",
    "from sklearn import metrics\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, StratifiedKFold\n",
    "\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "from sklearn.manifold import TSNE\n",
    "from sklearn.manifold import MDS\n",
    "\n",
    "\n",
    "from sklearn import preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "75f28a82",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('titanic.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "34567fe6",
   "metadata": {},
   "source": [
    "# Знакомство с датасетом"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "26b735e0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Name</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Ticket</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Cabin</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Braund, Mr. Owen Harris</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>A/5 21171</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Cumings, Mrs. John Bradley (Florence Briggs Th...</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>PC 17599</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>C85</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>Heikkinen, Miss. Laina</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>STON/O2. 3101282</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td>\n",
       "      <td>female</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>113803</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>C123</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Allen, Mr. William Henry</td>\n",
       "      <td>male</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>373450</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>886</th>\n",
       "      <td>887</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>Montvila, Rev. Juozas</td>\n",
       "      <td>male</td>\n",
       "      <td>27.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>211536</td>\n",
       "      <td>13.0000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>887</th>\n",
       "      <td>888</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Graham, Miss. Margaret Edith</td>\n",
       "      <td>female</td>\n",
       "      <td>19.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>112053</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>B42</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>888</th>\n",
       "      <td>889</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Johnston, Miss. Catherine Helen \"Carrie\"</td>\n",
       "      <td>female</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>W./C. 6607</td>\n",
       "      <td>23.4500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>S</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>889</th>\n",
       "      <td>890</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>Behr, Mr. Karl Howell</td>\n",
       "      <td>male</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>111369</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>C148</td>\n",
       "      <td>C</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>891</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Dooley, Mr. Patrick</td>\n",
       "      <td>male</td>\n",
       "      <td>32.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>370376</td>\n",
       "      <td>7.7500</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Q</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>891 rows × 12 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     PassengerId  Survived  Pclass  \\\n",
       "0              1         0       3   \n",
       "1              2         1       1   \n",
       "2              3         1       3   \n",
       "3              4         1       1   \n",
       "4              5         0       3   \n",
       "..           ...       ...     ...   \n",
       "886          887         0       2   \n",
       "887          888         1       1   \n",
       "888          889         0       3   \n",
       "889          890         1       1   \n",
       "890          891         0       3   \n",
       "\n",
       "                                                  Name     Sex   Age  SibSp  \\\n",
       "0                              Braund, Mr. Owen Harris    male  22.0      1   \n",
       "1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   \n",
       "2                               Heikkinen, Miss. Laina  female  26.0      0   \n",
       "3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   \n",
       "4                             Allen, Mr. William Henry    male  35.0      0   \n",
       "..                                                 ...     ...   ...    ...   \n",
       "886                              Montvila, Rev. Juozas    male  27.0      0   \n",
       "887                       Graham, Miss. Margaret Edith  female  19.0      0   \n",
       "888           Johnston, Miss. Catherine Helen \"Carrie\"  female   NaN      1   \n",
       "889                              Behr, Mr. Karl Howell    male  26.0      0   \n",
       "890                                Dooley, Mr. Patrick    male  32.0      0   \n",
       "\n",
       "     Parch            Ticket     Fare Cabin Embarked  \n",
       "0        0         A/5 21171   7.2500   NaN        S  \n",
       "1        0          PC 17599  71.2833   C85        C  \n",
       "2        0  STON/O2. 3101282   7.9250   NaN        S  \n",
       "3        0            113803  53.1000  C123        S  \n",
       "4        0            373450   8.0500   NaN        S  \n",
       "..     ...               ...      ...   ...      ...  \n",
       "886      0            211536  13.0000   NaN        S  \n",
       "887      0            112053  30.0000   B42        S  \n",
       "888      2        W./C. 6607  23.4500   NaN        S  \n",
       "889      0            111369  30.0000  C148        C  \n",
       "890      0            370376   7.7500   NaN        Q  \n",
       "\n",
       "[891 rows x 12 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "546f361a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 891 entries, 0 to 890\n",
      "Data columns (total 12 columns):\n",
      " #   Column       Non-Null Count  Dtype  \n",
      "---  ------       --------------  -----  \n",
      " 0   PassengerId  891 non-null    int64  \n",
      " 1   Survived     891 non-null    int64  \n",
      " 2   Pclass       891 non-null    int64  \n",
      " 3   Name         891 non-null    object \n",
      " 4   Sex          891 non-null    object \n",
      " 5   Age          714 non-null    float64\n",
      " 6   SibSp        891 non-null    int64  \n",
      " 7   Parch        891 non-null    int64  \n",
      " 8   Ticket       891 non-null    object \n",
      " 9   Fare         891 non-null    float64\n",
      " 10  Cabin        204 non-null    object \n",
      " 11  Embarked     889 non-null    object \n",
      "dtypes: float64(2), int64(5), object(5)\n",
      "memory usage: 83.7+ KB\n"
     ]
    }
   ],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7136451b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>PassengerId</th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>891.000000</td>\n",
       "      <td>891.000000</td>\n",
       "      <td>891.000000</td>\n",
       "      <td>714.000000</td>\n",
       "      <td>891.000000</td>\n",
       "      <td>891.000000</td>\n",
       "      <td>891.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>446.000000</td>\n",
       "      <td>0.383838</td>\n",
       "      <td>2.308642</td>\n",
       "      <td>29.699118</td>\n",
       "      <td>0.523008</td>\n",
       "      <td>0.381594</td>\n",
       "      <td>32.204208</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>257.353842</td>\n",
       "      <td>0.486592</td>\n",
       "      <td>0.836071</td>\n",
       "      <td>14.526497</td>\n",
       "      <td>1.102743</td>\n",
       "      <td>0.806057</td>\n",
       "      <td>49.693429</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.420000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>223.500000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>20.125000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>7.910400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>446.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>28.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>14.454200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>668.500000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>38.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>31.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>891.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>8.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>512.329200</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       PassengerId    Survived      Pclass         Age       SibSp  \\\n",
       "count   891.000000  891.000000  891.000000  714.000000  891.000000   \n",
       "mean    446.000000    0.383838    2.308642   29.699118    0.523008   \n",
       "std     257.353842    0.486592    0.836071   14.526497    1.102743   \n",
       "min       1.000000    0.000000    1.000000    0.420000    0.000000   \n",
       "25%     223.500000    0.000000    2.000000   20.125000    0.000000   \n",
       "50%     446.000000    0.000000    3.000000   28.000000    0.000000   \n",
       "75%     668.500000    1.000000    3.000000   38.000000    1.000000   \n",
       "max     891.000000    1.000000    3.000000   80.000000    8.000000   \n",
       "\n",
       "            Parch        Fare  \n",
       "count  891.000000  891.000000  \n",
       "mean     0.381594   32.204208  \n",
       "std      0.806057   49.693429  \n",
       "min      0.000000    0.000000  \n",
       "25%      0.000000    7.910400  \n",
       "50%      0.000000   14.454200  \n",
       "75%      0.000000   31.000000  \n",
       "max      6.000000  512.329200  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2f55a184",
   "metadata": {},
   "source": [
    "# Предварительная обработка"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "6991cc16",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Удаляем ненужные столбцы\n",
    "df = df.drop(['PassengerId', 'Ticket', 'Cabin', 'Name'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "7826ffb8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Кодируем поле Пол\n",
    "df.loc[df['Sex'] == 'male', 'Sex'] = 1\n",
    "df.loc[df['Sex'] == 'female', 'Sex'] = 0\n",
    "df.Sex = df.Sex.astype(bool)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc36c0d9",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "f90a0ba2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Parch</th>\n",
       "      <th>Fare</th>\n",
       "      <th>Embarked</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>7.2500</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>71.2833</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>False</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "      <td>35.0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>53.1000</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>35.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>8.0500</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>886</th>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>True</td>\n",
       "      <td>27.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>13.0000</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>887</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>False</td>\n",
       "      <td>19.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>888</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>False</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>23.4500</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>889</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>30.0000</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>890</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>32.0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.7500</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>891 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Survived  Pclass    Sex   Age  SibSp  Parch     Fare  Embarked\n",
       "0           0       3   True  22.0      1      0   7.2500         2\n",
       "1           1       1  False  38.0      1      0  71.2833         0\n",
       "2           1       3  False  26.0      0      0   7.9250         2\n",
       "3           1       1  False  35.0      1      0  53.1000         2\n",
       "4           0       3   True  35.0      0      0   8.0500         2\n",
       "..        ...     ...    ...   ...    ...    ...      ...       ...\n",
       "886         0       2   True  27.0      0      0  13.0000         2\n",
       "887         1       1  False  19.0      0      0  30.0000         2\n",
       "888         0       3  False   NaN      1      2  23.4500         2\n",
       "889         1       1   True  26.0      0      0  30.0000         0\n",
       "890         0       3   True  32.0      0      0   7.7500         1\n",
       "\n",
       "[891 rows x 8 columns]"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Кодируем поле Embarked\n",
    "df.Embarked.fillna(df.Embarked.mode()[0],inplace=True)\n",
    "\n",
    "le = preprocessing.LabelEncoder()\n",
    "le.fit(df['Embarked'])\n",
    "df['Embarked'] = le.transform(df['Embarked'])\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "01a0a715",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2aaacde7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Заполняем возраст медианой\n",
    "df.Age.fillna(df.Age.median(), inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17786cd0",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45039730",
   "metadata": {},
   "outputs": [],
   "source": [
    " df['Survived'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8a170235",
   "metadata": {},
   "source": [
    "# Визуализация датасета"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8abc136d",
   "metadata": {},
   "outputs": [],
   "source": [
    "tsne = TSNE(n_components=2, \n",
    "             init=\"pca\", \n",
    "             random_state=0,\n",
    "             perplexity=50,\n",
    "             n_iter = 1000,\n",
    "             metric = 'cosine')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b248c498",
   "metadata": {},
   "outputs": [],
   "source": [
    "Y = tsne.fit_transform(df.iloc[:,1:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cccd1aa8",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.scatter(Y[:,0], Y[:,1], c = df.iloc[:,0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5401296d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Multidimentional scaling\n",
    "mds = MDS(n_components=2, \n",
    "             random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "835c6abe",
   "metadata": {},
   "outputs": [],
   "source": [
    "Y_MDS = mds.fit_transform(df.iloc[:,1:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c0306f25",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.scatter(Y_MDS[:,0], Y_MDS[:,1], c = df.iloc[:,0])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f272811a",
   "metadata": {},
   "source": [
    "\n",
    "## Графики по столбцам"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f42d003",
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.countplot(x = df.Survived)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "959232fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.countplot(x = df.Pclass, hue = df.Survived)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "00bbd94f",
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.set(rc={'figure.figsize':(10,5)})\n",
    "sns.histplot(x = df.loc[df.Fare < 200].Fare, hue  = df.Survived, kde=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ae491dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.set(rc={'figure.figsize':(10,5)})\n",
    "sns.histplot(x = df.Age, hue  = df.Survived, kde=True )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aafbfaaf",
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.countplot(x = df.Sex, hue = df.Survived)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ddb15368",
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.set(rc={'figure.figsize':(10,5)})\n",
    "sns.histplot(x = df.SibSp, hue  = df.Survived, kde=True )\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3e34f4ec",
   "metadata": {},
   "source": [
    "## Корреляционная матрица"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "52722264",
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.heatmap(df.corr(numeric_only = True), annot = True,  vmin=-1, vmax=1, cmap = 'bwr')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "345f8d73",
   "metadata": {},
   "source": [
    "# Классификация"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ce53d2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,1:], df.iloc[:,0], test_size=0.33, random_state=42) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f868d885",
   "metadata": {},
   "outputs": [],
   "source": [
    "RF = RandomForestClassifier(random_state=42)\n",
    "RF.fit(X_train, y_train)\n",
    "rf_prediction = RF.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "378e01b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Conf matrix')\n",
    "print(metrics.confusion_matrix(rf_prediction, y_test))\n",
    "print('Classification report')\n",
    "print(metrics.classification_report(rf_prediction, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7087f545",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Вероятности каждого класса\n",
    "rf_prediction_proba = RF.predict_proba(X_test)\n",
    "rf_prediction_proba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34f7f23b",
   "metadata": {},
   "outputs": [],
   "source": [
    "rf_prediction"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a7dba003",
   "metadata": {},
   "source": [
    "## Важность признаков"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e5652417",
   "metadata": {},
   "outputs": [],
   "source": [
    "fi = pd.DataFrame(RF.feature_importances_, RF.feature_names_in_)\n",
    "fi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ca8c32a",
   "metadata": {},
   "outputs": [],
   "source": [
    " sns.barplot(fi.T)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e8fea763",
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_saf = X_train[['Sex', 'Age', 'Fare']]\n",
    "X_test_saf = X_test[['Sex', 'Age', 'Fare']]\n",
    "RF_saf = RandomForestClassifier(random_state=42)\n",
    "RF_saf.fit(X_train_saf, y_train)\n",
    "rf_saf_prediction =RF_saf.predict(X_test_saf )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7ba5e827",
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Conf matrix')\n",
    "metrics.confusion_matrix(rf_saf_prediction, y_test)\n",
    "print('Classification report')\n",
    "print(metrics.classification_report(rf_saf_prediction, y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "81cffad2",
   "metadata": {},
   "source": [
    "## XGBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4dae05fa",
   "metadata": {},
   "outputs": [],
   "source": [
    "from xgboost import XGBClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "824c59ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "XGB = XGBClassifier(random_state=42)\n",
    "XGB.fit(X_train, y_train)\n",
    "xgb_prediction =XGB.predict(X_test )\n",
    "print('Conf matrix')\n",
    "print(metrics.confusion_matrix(xgb_prediction, y_test))\n",
    "print('Classification report')\n",
    "print(metrics.classification_report(xgb_prediction, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7b73d67b",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "XGB_saf = XGBClassifier(random_state=42)\n",
    "XGB_saf.fit(X_train_saf, y_train)\n",
    "xgb_saf_prediction =XGB_saf.predict(X_test_saf)\n",
    "print('Conf matrix')\n",
    "print(metrics.confusion_matrix(xgb_saf_prediction, y_test))\n",
    "print('Classification report')\n",
    "print(metrics.classification_report(xgb_saf_prediction, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3964f444",
   "metadata": {},
   "outputs": [],
   "source": [
    "sns.heatmap(metrics.confusion_matrix(xgb_saf_prediction, y_test), annot = True, cmap = 'RdYlGn')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "837596c9",
   "metadata": {},
   "source": [
    "## Попробуем настроить параметры для RF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8ea4088c",
   "metadata": {},
   "outputs": [],
   "source": [
    "#X_train_sef = X_train[['Sex', 'Age', 'Fare']]\n",
    "#X_test_sef = X_test[['Sex', 'Age', 'Fare']]\n",
    "\n",
    "params = {\n",
    "    'n_estimators': [1, 10, 50, 100],\n",
    "    'max_depth': [1, 5, 10],\n",
    "    'criterion':['gini', 'entropy']\n",
    "}\n",
    "\n",
    "\n",
    "RF_saf_gs = RandomForestClassifier(random_state=42)\n",
    "\n",
    "gs = GridSearchCV(param_grid=params, estimator=RF_saf_gs)\n",
    "gs.fit(X_train_saf, y_train)\n",
    "rf_saf_gs_prediction=gs.predict(X_test_saf )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1eaa5d33",
   "metadata": {},
   "outputs": [],
   "source": [
    "gs.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bb7206ef",
   "metadata": {},
   "outputs": [],
   "source": [
    "print('Conf matrix')\n",
    "print(metrics.confusion_matrix(rf_saf_gs_prediction, y_test))\n",
    "print('Classification report')\n",
    "print(metrics.classification_report(rf_saf_gs_prediction, y_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "08d86012",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e98ede8",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}