33 KiB
33 KiB
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from sklearn import preprocessing

# Load the Titanic passenger table from a CSV file next to the notebook.
df = pd.read_csv('titanic.csv')

# === Getting to know the dataset ===
df
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
| 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
| 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
| 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
| 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
| 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
| 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
891 rows × 12 columns
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
|---|---|---|---|---|---|---|---|
| count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
| mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
| std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
| min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
| 50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
| 75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
| max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
Предварительная обработка
# Drop the columns that are not used as model features.
df = df.drop(['PassengerId', 'Ticket', 'Cabin', 'Name'], axis=1)

# Encode Sex as a boolean: 'female' -> False, everything else -> True
# (i.e. True = male), which matches the previous 1/0 + astype(bool) result.
# Building the column with a comparison avoids writing ints into a string
# column through .loc, which string-dtype columns may reject.
df['Sex'] = df['Sex'] != 'female'

# Fill missing embarkation ports with the most frequent port.
# The result is assigned back instead of calling fillna(inplace=True) on
# df.Embarked: that chained in-place call acts on a temporary object under
# pandas Copy-on-Write and would leave df unchanged.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# Replace the port labels with integer codes.
le = preprocessing.LabelEncoder()
df['Embarked'] = le.fit_transform(df['Embarked'])
df
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | True | 22.0 | 1 | 0 | 7.2500 | 2 |
| 1 | 1 | 1 | False | 38.0 | 1 | 0 | 71.2833 | 0 |
| 2 | 1 | 3 | False | 26.0 | 0 | 0 | 7.9250 | 2 |
| 3 | 1 | 1 | False | 35.0 | 1 | 0 | 53.1000 | 2 |
| 4 | 0 | 3 | True | 35.0 | 0 | 0 | 8.0500 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 886 | 0 | 2 | True | 27.0 | 0 | 0 | 13.0000 | 2 |
| 887 | 1 | 1 | False | 19.0 | 0 | 0 | 30.0000 | 2 |
| 888 | 0 | 3 | False | NaN | 1 | 2 | 23.4500 | 2 |
| 889 | 1 | 1 | True | 26.0 | 0 | 0 | 30.0000 | 0 |
| 890 | 0 | 3 | True | 32.0 | 0 | 0 | 7.7500 | 1 |
891 rows × 8 columns
# Fill missing ages with the median age.
# Assign the result back rather than using a chained fillna(inplace=True),
# which does not modify df under pandas Copy-on-Write.
df['Age'] = df['Age'].fillna(df['Age'].median())

df.info()

df['Survived'].value_counts()

# === Dataset visualization ===
# 2-D t-SNE embedding of the feature columns (column 0 is the Survived target
# and is only used to colour the points).
# The iteration count is left at the library default (1000, the value that was
# passed before): the keyword was renamed from n_iter to max_iter in newer
# scikit-learn releases, so naming it explicitly breaks on some versions.
tsne = TSNE(
    n_components=2,
    init="pca",
    random_state=0,
    perplexity=50,
    metric='cosine',
)

Y = tsne.fit_transform(df.iloc[:, 1:])

# Start a fresh figure so the scatter does not draw onto earlier axes when the
# notebook is run as a plain script.
plt.figure()
plt.scatter(Y[:, 0], Y[:, 1], c=df.iloc[:, 0])

# Multidimensional scaling
mds = MDS(n_components=2, random_state=0)

Y_MDS = mds.fit_transform(df.iloc[:, 1:])

plt.figure()
plt.scatter(Y_MDS[:, 0], Y_MDS[:, 1], c=df.iloc[:, 0])

# === Per-column plots ===
# Each plot gets its own figure; seaborn's axes-level functions otherwise draw
# onto the current axes and the plots would overlap when run as a script.

# Class balance of the target.
plt.figure()
sns.countplot(x=df.Survived)

# Survival counts per passenger class.
plt.figure()
sns.countplot(x=df.Pclass, hue=df.Survived)

# Switch to the seaborn theme with a wider default figure for the remaining
# plots. One call is enough: the setting persists until changed again.
sns.set(rc={'figure.figsize': (10, 5)})

# Fare distribution by survival; fares of 200 and above are left out of x.
plt.figure()
sns.histplot(x=df.loc[df.Fare < 200].Fare, hue=df.Survived, kde=True)

# Age distribution by survival.
plt.figure()
sns.histplot(x=df.Age, hue=df.Survived, kde=True)

# Survival counts by sex.
plt.figure()
sns.countplot(x=df.Sex, hue=df.Survived)

# Distribution of SibSp by survival.
plt.figure()
sns.histplot(x=df.SibSp, hue=df.Survived, kde=True)

# === Correlation matrix ===
# Pairwise correlations of the numeric columns, on a fixed [-1, 1] colour scale.
# A fresh figure keeps the heatmap off any previously used axes.
plt.figure()
sns.heatmap(df.corr(numeric_only=True), annot=True, vmin=-1, vmax=1, cmap='bwr')

# === Classification ===
# Hold out a third of the rows for testing. Column 0 is the target (Survived),
# the remaining columns are the features.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, 1:], df.iloc[:, 0], test_size=0.33, random_state=42
)

# Baseline random forest on all features.
RF = RandomForestClassifier(random_state=42)
RF.fit(X_train, y_train)
rf_prediction = RF.predict(X_test)

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped the
# confusion matrix is transposed and precision/recall are exchanged.
print('Conf matrix')
print(metrics.confusion_matrix(y_test, rf_prediction))
print('Classification report')
print(metrics.classification_report(y_test, rf_prediction))

# Probability of each class for every test row
rf_prediction_proba = RF.predict_proba(X_test)
rf_prediction_proba

rf_prediction

# === Feature importance ===
# Feature importances of the fitted forest, indexed by feature name.
fi = pd.DataFrame(RF.feature_importances_, RF.feature_names_in_)
fi

# Transposed so that each feature becomes its own bar.
plt.figure()
sns.barplot(data=fi.T)

# Retrain using only the Sex, Age and Fare columns.
X_train_saf = X_train[['Sex', 'Age', 'Fare']]
X_test_saf = X_test[['Sex', 'Age', 'Fare']]
RF_saf = RandomForestClassifier(random_state=42)
RF_saf.fit(X_train_saf, y_train)
rf_saf_prediction = RF_saf.predict(X_test_saf)

# Metrics take (y_true, y_pred). The confusion matrix is printed explicitly:
# as a bare expression in the middle of a cell it was never displayed.
print('Conf matrix')
print(metrics.confusion_matrix(y_test, rf_saf_prediction))
print('Classification report')
print(metrics.classification_report(y_test, rf_saf_prediction))

# === XGBoost ===
from xgboost import XGBClassifier

# Gradient-boosted trees on all features.
XGB = XGBClassifier(random_state=42)
XGB.fit(X_train, y_train)
xgb_prediction = XGB.predict(X_test)

# Metrics take (y_true, y_pred); the swapped order transposes the matrix and
# exchanges precision and recall in the report.
print('Conf matrix')
print(metrics.confusion_matrix(y_test, xgb_prediction))
print('Classification report')
print(metrics.classification_report(y_test, xgb_prediction))

# Same model restricted to the Sex, Age and Fare columns.
XGB_saf = XGBClassifier(random_state=42)
XGB_saf.fit(X_train_saf, y_train)
xgb_saf_prediction = XGB_saf.predict(X_test_saf)

print('Conf matrix')
print(metrics.confusion_matrix(y_test, xgb_saf_prediction))
print('Classification report')
print(metrics.classification_report(y_test, xgb_saf_prediction))

# Confusion matrix as a heatmap (rows: true class, columns: predicted class).
plt.figure()
sns.heatmap(
    metrics.confusion_matrix(y_test, xgb_saf_prediction),
    annot=True,
    cmap='RdYlGn',
)

# === Tuning the random forest hyperparameters ===
# Exhaustive search over forest size, tree depth and split criterion,
# evaluated with GridSearchCV's default cross-validation on the training split.
params = {
    'n_estimators': [1, 10, 50, 100],
    'max_depth': [1, 5, 10],
    'criterion': ['gini', 'entropy'],
}
RF_saf_gs = RandomForestClassifier(random_state=42)
gs = GridSearchCV(estimator=RF_saf_gs, param_grid=params)
gs.fit(X_train_saf, y_train)

# predict() uses the estimator refitted with the best parameter combination.
rf_saf_gs_prediction = gs.predict(X_test_saf)

gs.best_params_

# Metrics take (y_true, y_pred); the swapped order transposes the matrix and
# exchanges precision and recall in the report.
print('Conf matrix')
print(metrics.confusion_matrix(y_test, rf_saf_gs_prediction))
print('Classification report')
print(metrics.classification_report(y_test, rf_saf_gs_prediction))