import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from sklearn import preprocessing
df = pd.read_csv('titanic.csv')
Getting to know the dataset
df
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
df.describe()
PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
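The describe() summary shows Age has only 714 non-null entries, and df.info() shows Cabin and Embarked are also incomplete. A quick per-column check of the gaps (a small sketch, not part of the original notebook):

# Count missing values per column; Age, Cabin and Embarked have gaps
df.isna().sum()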
Preprocessing
# Drop columns we won't use
df = df.drop(['PassengerId', 'Ticket', 'Cabin', 'Name'], axis=1)
# Encode the Sex column
df.loc[df['Sex'] == 'male', 'Sex'] = 1
df.loc[df['Sex'] == 'female', 'Sex'] = 0
df.Sex = df.Sex.astype(bool)
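An equivalent, more compact encoding (a sketch of an alternative, meant to run instead of the three lines above, not after them) maps the two string values straight to booleans:

# One-line alternative to the assignments above: male -> True, female -> False
df['Sex'] = df['Sex'].map({'male': True, 'female': False})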
# Encode the Embarked column: fill the two missing values with the mode, then label-encode
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

le = preprocessing.LabelEncoder()
le.fit(df['Embarked'])
df['Embarked'] = le.transform(df['Embarked'])
df
Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 3 | True | 22.0 | 1 | 0 | 7.2500 | 2 |
1 | 1 | 1 | False | 38.0 | 1 | 0 | 71.2833 | 0 |
2 | 1 | 3 | False | 26.0 | 0 | 0 | 7.9250 | 2 |
3 | 1 | 1 | False | 35.0 | 1 | 0 | 53.1000 | 2 |
4 | 0 | 3 | True | 35.0 | 0 | 0 | 8.0500 | 2 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 0 | 2 | True | 27.0 | 0 | 0 | 13.0000 | 2 |
887 | 1 | 1 | False | 19.0 | 0 | 0 | 30.0000 | 2 |
888 | 0 | 3 | False | NaN | 1 | 2 | 23.4500 | 2 |
889 | 1 | 1 | True | 26.0 | 0 | 0 | 30.0000 | 0 |
890 | 0 | 3 | True | 32.0 | 0 | 0 | 7.7500 | 1 |
891 rows × 8 columns
# Fill missing Age values with the median
df['Age'] = df['Age'].fillna(df['Age'].median())
df.info()
df['Survived'].value_counts()
Dataset visualization
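About 62% of passengers in this file did not survive (mean Survived ≈ 0.38 in describe()), which is the baseline accuracy any classifier has to beat. A quick way to see the class shares (a small added sketch):

# Class shares: the majority class ('not survived') is the accuracy baseline
df['Survived'].value_counts(normalize=True)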
tsne = TSNE(n_components=2,
            init="pca",
            random_state=0,
            perplexity=50,
            n_iter=1000,
            metric='cosine')
Y = tsne.fit_transform(df.iloc[:, 1:])
plt.scatter(Y[:, 0], Y[:, 1], c=df.iloc[:, 0])
# Multidimensional scaling
mds = MDS(n_components=2,
          random_state=0)
Y_MDS = mds.fit_transform(df.iloc[:, 1:])
plt.scatter(Y_MDS[:, 0], Y_MDS[:, 1], c=df.iloc[:, 0])
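Both t-SNE and MDS work on pairwise distances, and Fare has a far larger scale than the other columns (up to 512 against single-digit values), so it can dominate the embeddings. A hedged variant that standardizes the features first, using the preprocessing module that is already imported:

# Standardize features so Fare does not dominate the distance-based embeddings
X_scaled = preprocessing.StandardScaler().fit_transform(df.iloc[:, 1:])
Y_scaled = TSNE(n_components=2, init="pca", random_state=0,
                perplexity=50, metric='cosine').fit_transform(X_scaled)
plt.scatter(Y_scaled[:, 0], Y_scaled[:, 1], c=df.iloc[:, 0])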
Plots by column
sns.countplot(x=df.Survived)
sns.countplot(x=df.Pclass, hue=df.Survived)
sns.set(rc={'figure.figsize': (10, 5)})
sns.histplot(x=df.loc[df.Fare < 200].Fare, hue=df.Survived, kde=True)
sns.set(rc={'figure.figsize': (10, 5)})
sns.histplot(x=df.Age, hue=df.Survived, kde=True)
sns.countplot(x=df.Sex, hue=df.Survived)
sns.set(rc={'figure.figsize': (10, 5)})
sns.histplot(x=df.SibSp, hue=df.Survived, kde=True)
Correlation matrix
sns.heatmap(df.corr(numeric_only=True), annot=True, vmin=-1, vmax=1, cmap='bwr')
Classification
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 1:], df.iloc[:, 0], test_size=0.33, random_state=42)
RF = RandomForestClassifier(random_state=42)
RF.fit(X_train, y_train)
rf_prediction = RF.predict(X_test)
print('Conf matrix')
print(metrics.confusion_matrix(y_test, rf_prediction))
print('Classification report')
print(metrics.classification_report(y_test, rf_prediction))
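A single 33% hold-out split on roughly 900 rows gives a fairly noisy estimate. The already-imported cross_val_score and StratifiedKFold can give a more stable figure; a sketch under those assumptions:

# 5-fold stratified cross-validated accuracy of the same random forest
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(RandomForestClassifier(random_state=42),
                         df.iloc[:, 1:], df.iloc[:, 0], cv=cv)
print(scores.mean(), scores.std())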
# Predicted probability of each class
rf_prediction_proba = RF.predict_proba(X_test)
rf_prediction_proba
rf_prediction
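Since class probabilities are available, a threshold-free metric such as ROC AUC is easy to add; a minimal sketch (roc_auc_score takes the probability of the positive class):

# ROC AUC computed from the predicted probability of class 1 (survived)
metrics.roc_auc_score(y_test, rf_prediction_proba[:, 1])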
Feature importance
fi = pd.DataFrame(RF.feature_importances_, RF.feature_names_in_)
fi
sns.barplot(fi.T)
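Impurity-based importances tend to favour high-cardinality numeric features such as Fare and Age. Permutation importance on the test set is a common cross-check; a sketch using sklearn.inspection, which the notebook itself does not import:

# Permutation importance: mean drop in score when each feature is shuffled
from sklearn.inspection import permutation_importance
perm = permutation_importance(RF, X_test, y_test, random_state=42)
pd.Series(perm.importances_mean, index=X_test.columns).sort_values()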
X_train_saf = X_train[['Sex', 'Age', 'Fare']]
X_test_saf = X_test[['Sex', 'Age', 'Fare']]
RF_saf = RandomForestClassifier(random_state=42)
RF_saf.fit(X_train_saf, y_train)
rf_saf_prediction = RF_saf.predict(X_test_saf)
print('Conf matrix')
print(metrics.confusion_matrix(y_test, rf_saf_prediction))
print('Classification report')
print(metrics.classification_report(y_test, rf_saf_prediction))
XGBoost
from xgboost import XGBClassifier
XGB = XGBClassifier(random_state=42)
XGB.fit(X_train, y_train)
xgb_prediction = XGB.predict(X_test)
print('Conf matrix')
print(metrics.confusion_matrix(y_test, xgb_prediction))
print('Classification report')
print(metrics.classification_report(y_test, xgb_prediction))
XGB_saf = XGBClassifier(random_state=42)
XGB_saf.fit(X_train_saf, y_train)
xgb_saf_prediction = XGB_saf.predict(X_test_saf)
print('Conf matrix')
print(metrics.confusion_matrix(y_test, xgb_saf_prediction))
print('Classification report')
print(metrics.classification_report(y_test, xgb_saf_prediction))
sns.heatmap(metrics.confusion_matrix(y_test, xgb_saf_prediction), annot=True, cmap='RdYlGn')
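To compare the four fitted models on equal terms, their hold-out accuracies can be printed side by side; a small sketch with metrics.accuracy_score:

# Held-out accuracy of each model for a quick side-by-side comparison
for name, pred in [('RF', rf_prediction), ('RF Sex/Age/Fare', rf_saf_prediction),
                   ('XGB', xgb_prediction), ('XGB Sex/Age/Fare', xgb_saf_prediction)]:
    print(name, metrics.accuracy_score(y_test, pred))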
Trying to tune the RF hyperparameters
#X_train_sef = X_train[['Sex', 'Age', 'Fare']]
#X_test_sef = X_test[['Sex', 'Age', 'Fare']]
params = {
    'n_estimators': [1, 10, 50, 100],
    'max_depth': [1, 5, 10],
    'criterion': ['gini', 'entropy']
}
RF_saf_gs = RandomForestClassifier(random_state=42)
gs = GridSearchCV(param_grid=params, estimator=RF_saf_gs)
gs.fit(X_train_saf, y_train)
rf_saf_gs_prediction = gs.predict(X_test_saf)
gs.best_params_
print('Conf matrix')
print(metrics.confusion_matrix(y_test, rf_saf_gs_prediction))
print('Classification report')
print(metrics.classification_report(y_test, rf_saf_gs_prediction))
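GridSearchCV also exposes the best cross-validated score and the refit estimator, and the imported StratifiedKFold can be passed explicitly as the cv strategy; a hedged sketch:

# Best CV accuracy, the refit model, and an explicit stratified CV strategy
print(gs.best_score_)
best_rf = gs.best_estimator_

gs_strat = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                        param_grid=params,
                        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42))
gs_strat.fit(X_train_saf, y_train)
print(gs_strat.best_params_, gs_strat.best_score_)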