Вы не можете выбрать более 25 тем. Темы должны начинаться с буквы или цифры, могут содержать дефисы (-) и должны содержать не более 35 символов.

33 KiB

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns


from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.manifold import TSNE
from sklearn.manifold import MDS


from sklearn import preprocessing
# Load the Titanic passenger table from a CSV file given by a relative path
df = pd.read_csv('titanic.csv')

Знакомство с датасетом

# Display the raw DataFrame (the table that follows is the notebook cell output)
df
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

# Print column dtypes and non-null counts; the output below shows gaps in Age, Cabin and Embarked
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns
df.describe()
PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200

Предварительная обработка

# Drop the columns that are not used as features
df = df.drop(columns=['PassengerId', 'Ticket', 'Cabin', 'Name'])

# Encode Sex as a boolean: True for 'male', False otherwise.
# A direct comparison avoids writing integers into a string column and then
# relying on truthiness in astype(bool).
df['Sex'] = df['Sex'] == 'male'

# Fill missing Embarked values with the most frequent value.
# The result is assigned back to the column: calling fillna(inplace=True) on
# df.Embarked is a chained in-place update, which pandas 2.x warns about and
# which does not modify df under Copy-on-Write.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Replace the Embarked labels with integer codes
le = preprocessing.LabelEncoder()
df['Embarked'] = le.fit_transform(df['Embarked'])
df
Survived Pclass Sex Age SibSp Parch Fare Embarked
0 0 3 True 22.0 1 0 7.2500 2
1 1 1 False 38.0 1 0 71.2833 0
2 1 3 False 26.0 0 0 7.9250 2
3 1 1 False 35.0 1 0 53.1000 2
4 0 3 True 35.0 0 0 8.0500 2
... ... ... ... ... ... ... ... ...
886 0 2 True 27.0 0 0 13.0000 2
887 1 1 False 19.0 0 0 30.0000 2
888 0 3 False NaN 1 2 23.4500 2
889 1 1 True 26.0 0 0 30.0000 0
890 0 3 True 32.0 0 0 7.7500 1

891 rows × 8 columns

# Fill missing ages with the median age.
# Assign the result back instead of using fillna(inplace=True) on df.Age:
# the chained in-place form does not update df under pandas Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].median())
df.info()
# Class balance of the target column
df['Survived'].value_counts()

Визуализация датасета

# Column 0 is the target (Survived); the remaining columns are the features.
features = df.iloc[:, 1:]
labels = df.iloc[:, 0]

# 2-D t-SNE embedding of the feature matrix, coloured by the target.
# The iteration count is left at its default (1000, the value used before):
# the `n_iter` keyword was renamed to `max_iter` in scikit-learn 1.5 and the
# old name was later removed, so omitting it works on both old and new versions.
tsne = TSNE(
    n_components=2,
    init="pca",
    random_state=0,
    perplexity=50,
    metric='cosine',
)
Y = tsne.fit_transform(features)
plt.figure()  # separate figure so the MDS scatter below is not drawn on top
plt.scatter(Y[:, 0], Y[:, 1], c=labels)

# Multidimensional scaling of the same features, coloured by the target
mds = MDS(n_components=2, random_state=0)
Y_MDS = mds.fit_transform(features)
plt.figure()
plt.scatter(Y_MDS[:, 0], Y_MDS[:, 1], c=labels)

Графики по столбцам

# Target column, reused as the x variable or as the hue in the plots below
survived = df['Survived']
wide_figure = {'figure.figsize': (10, 5)}

# Count of passengers per target class
sns.countplot(x=survived)

# Passenger class, split by target
sns.countplot(x=df['Pclass'], hue=survived)

# Fare distribution restricted to fares below 200, split by target
sns.set(rc=wide_figure)
fare_below_200 = df.loc[df['Fare'] < 200, 'Fare']
sns.histplot(x=fare_below_200, hue=survived, kde=True)

# Age distribution, split by target
sns.set(rc=wide_figure)
sns.histplot(x=df['Age'], hue=survived, kde=True)

# Sex, split by target
sns.countplot(x=df['Sex'], hue=survived)

# SibSp distribution, split by target
sns.set(rc=wide_figure)
sns.histplot(x=df['SibSp'], hue=survived, kde=True)

Корреляционная матрица

# Pairwise correlations of the numeric columns, drawn as an annotated heatmap
# with the colour scale fixed to [-1, 1]
corr_matrix = df.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, vmin=-1, vmax=1, cmap='bwr')

Классификация

# Hold out 33% of the rows for testing; column 0 is the target, the rest are features
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, 1:], df.iloc[:, 0], test_size=0.33, random_state=42
)

# Random forest on all features
RF = RandomForestClassifier(random_state=42)
RF.fit(X_train, y_train)
rf_prediction = RF.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). Passing them the other way round
# transposes the confusion matrix and swaps precision with recall in the report.
print('Conf matrix')
print(metrics.confusion_matrix(y_test, rf_prediction))
print('Classification report')
print(metrics.classification_report(y_test, rf_prediction))
# Predicted probability of each class
rf_prediction_proba = RF.predict_proba(X_test)
rf_prediction_proba
rf_prediction

Важность признаков

# Feature importances of the fitted forest, indexed by feature name
fi = pd.DataFrame(RF.feature_importances_, RF.feature_names_in_)
fi
sns.barplot(fi.T)

# Retrain on the Sex, Age and Fare columns only
X_train_saf = X_train[['Sex', 'Age', 'Fare']]
X_test_saf = X_test[['Sex', 'Age', 'Fare']]
RF_saf = RandomForestClassifier(random_state=42)
RF_saf.fit(X_train_saf, y_train)
rf_saf_prediction = RF_saf.predict(X_test_saf)

# Metrics take (y_true, y_pred). The confusion matrix is printed explicitly:
# a bare expression in the middle of a cell is never displayed.
print('Conf matrix')
print(metrics.confusion_matrix(y_test, rf_saf_prediction))
print('Classification report')
print(metrics.classification_report(y_test, rf_saf_prediction))

XGBoost

from xgboost import XGBClassifier

# Gradient-boosted trees on all features
XGB = XGBClassifier(random_state=42)
XGB.fit(X_train, y_train)
xgb_prediction = XGB.predict(X_test)
# Metrics take (y_true, y_pred); the reversed order transposes the matrix
# and swaps precision with recall.
print('Conf matrix')
print(metrics.confusion_matrix(y_test, xgb_prediction))
print('Classification report')
print(metrics.classification_report(y_test, xgb_prediction))

# Same model on the reduced Sex/Age/Fare feature set
XGB_saf = XGBClassifier(random_state=42)
XGB_saf.fit(X_train_saf, y_train)
xgb_saf_prediction = XGB_saf.predict(X_test_saf)
xgb_saf_confusion = metrics.confusion_matrix(y_test, xgb_saf_prediction)
print('Conf matrix')
print(xgb_saf_confusion)
print('Classification report')
print(metrics.classification_report(y_test, xgb_saf_prediction))
# Rows are true classes, columns are predicted classes
sns.heatmap(xgb_saf_confusion, annot=True, cmap='RdYlGn')

Попробуем настроить параметры для RF

# Hyper-parameter grid for the random forest trained on Sex/Age/Fare
params = {
    'n_estimators': [1, 10, 50, 100],
    'max_depth': [1, 5, 10],
    'criterion': ['gini', 'entropy'],
}

RF_saf_gs = RandomForestClassifier(random_state=42)

# Cross-validated search over the grid; predict() then uses the best estimator
gs = GridSearchCV(estimator=RF_saf_gs, param_grid=params)
gs.fit(X_train_saf, y_train)
rf_saf_gs_prediction = gs.predict(X_test_saf)
# Print the selected parameters explicitly so they are visible even when
# this is not the last expression of a notebook cell
print(gs.best_params_)
# Metrics take (y_true, y_pred)
print('Conf matrix')
print(metrics.confusion_matrix(y_test, rf_saf_gs_prediction))
print('Classification report')
print(metrics.classification_report(y_test, rf_saf_gs_prediction))