Вы не можете выбрать более 25 тем
Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.
443 KiB
443 KiB
import pandas as pd
import matplotlib as plt
import seaborn as sns
import numpy as np
import pickle
Загрузка и знакомство с данными
# dataset https://www.kaggle.com/datasets/mrdaniilak/russia-real-estate-20182021/data
# Load the raw listings table from the CSV export of the dataset.
df = pd.read_csv('data/all_v2.csv')
# Preview the first ten rows.
df.head(10)
price | date | time | geo_lat | geo_lon | region | building_type | level | levels | rooms | area | kitchen_area | object_type | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6050000 | 2018-02-19 | 20:00:21 | 59.805808 | 30.376141 | 2661 | 1 | 8 | 10 | 3 | 82.6 | 10.8 | 1 |
1 | 8650000 | 2018-02-27 | 12:04:54 | 55.683807 | 37.297405 | 81 | 3 | 5 | 24 | 2 | 69.1 | 12.0 | 1 |
2 | 4000000 | 2018-02-28 | 15:44:00 | 56.295250 | 44.061637 | 2871 | 1 | 5 | 9 | 3 | 66.0 | 10.0 | 1 |
3 | 1850000 | 2018-03-01 | 11:24:52 | 44.996132 | 39.074783 | 2843 | 4 | 12 | 16 | 2 | 38.0 | 5.0 | 11 |
4 | 5450000 | 2018-03-01 | 17:42:43 | 55.918767 | 37.984642 | 81 | 3 | 13 | 14 | 2 | 60.0 | 10.0 | 1 |
5 | 3300000 | 2018-03-02 | 21:18:42 | 55.908253 | 37.726448 | 81 | 1 | 4 | 5 | 1 | 32.0 | 6.0 | 1 |
6 | 4704280 | 2018-03-04 | 12:35:25 | 55.621097 | 37.431002 | 3 | 2 | 1 | 25 | 1 | 31.7 | 6.0 | 11 |
7 | 3600000 | 2018-03-04 | 20:52:38 | 59.875526 | 30.395457 | 2661 | 1 | 2 | 5 | 1 | 31.1 | 6.0 | 1 |
8 | 3390000 | 2018-03-05 | 07:07:05 | 53.195031 | 50.106952 | 3106 | 2 | 4 | 24 | 2 | 64.0 | 13.0 | 11 |
9 | 2800000 | 2018-03-06 | 09:57:10 | 55.736972 | 38.846457 | 81 | 1 | 9 | 10 | 2 | 55.0 | 8.0 | 1 |
# Show dtypes, non-null counts and memory usage of the raw frame.
df.info(show_counts=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5477006 entries, 0 to 5477005
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 price 5477006 non-null int64
1 date 5477006 non-null object
2 time 5477006 non-null object
3 geo_lat 5477006 non-null float64
4 geo_lon 5477006 non-null float64
5 region 5477006 non-null int64
6 building_type 5477006 non-null int64
7 level 5477006 non-null int64
8 levels 5477006 non-null int64
9 rooms 5477006 non-null int64
10 area 5477006 non-null float64
11 kitchen_area 5477006 non-null float64
12 object_type 5477006 non-null int64
dtypes: float64(4), int64(7), object(2)
memory usage: 543.2+ MB
# Downcast columns to compact dtypes to reduce the frame's memory footprint.
# Columns are converted one at a time so no full copy of the frame is made.
compact_dtypes = {
    # Code-like columns with few distinct values.
    'region': 'category',
    'building_type': 'category',
    'object_type': 'category',
    # Small integer counters.
    'level': 'int8',
    'levels': 'int8',
    'rooms': 'int8',
    # float32 rather than float16: with float16 the recorded describe() run
    # raised overflow RuntimeWarnings and reported NaN mean / 0 std for both
    # area columns.
    'area': 'float32',
    'kitchen_area': 'float32',
    'geo_lat': 'float32',
    'geo_lon': 'float32',
}
for column, dtype in compact_dtypes.items():
    df[column] = df[column].astype(dtype)

# Summary statistics of the numeric columns.
df.describe()
/home/andrey/work/institute/MLE/assets/eda/.venv_eda/lib/python3.10/site-packages/pandas/core/nanops.py:1487: RuntimeWarning: overflow encountered in cast
return dtype.type(n)
/home/andrey/work/institute/MLE/assets/eda/.venv_eda/lib/python3.10/site-packages/numpy/_core/_methods.py:53: RuntimeWarning: overflow encountered in reduce
return umr_sum(a, axis, dtype, out, keepdims, initial, where)
/home/andrey/work/institute/MLE/assets/eda/.venv_eda/lib/python3.10/site-packages/pandas/core/nanops.py:731: RuntimeWarning: invalid value encountered in scalar divide
the_mean = the_sum / count if count > 0 else np.nan
/home/andrey/work/institute/MLE/assets/eda/.venv_eda/lib/python3.10/site-packages/pandas/core/nanops.py:1487: RuntimeWarning: overflow encountered in cast
return dtype.type(n)
/home/andrey/work/institute/MLE/assets/eda/.venv_eda/lib/python3.10/site-packages/numpy/_core/_methods.py:53: RuntimeWarning: overflow encountered in reduce
return umr_sum(a, axis, dtype, out, keepdims, initial, where)
/home/andrey/work/institute/MLE/assets/eda/.venv_eda/lib/python3.10/site-packages/pandas/core/nanops.py:731: RuntimeWarning: invalid value encountered in scalar divide
the_mean = the_sum / count if count > 0 else np.nan
price | geo_lat | geo_lon | level | levels | rooms | area | kitchen_area | |
---|---|---|---|---|---|---|---|---|
count | 5.477006e+06 | 5.477006e+06 | 5.477006e+06 | 5.477006e+06 | 5.477006e+06 | 5.477006e+06 | 5.477006e+06 | 5.477006e+06 |
mean | 4.422029e+06 | 5.403825e+01 | 5.324431e+01 | 6.214530e+00 | 1.139892e+01 | 1.726173e+00 | NaN | NaN |
std | 2.150752e+07 | 4.622758e+00 | 2.074763e+01 | 4.957419e+00 | 6.535734e+00 | 1.082133e+00 | 0.000000e+00 | 0.000000e+00 |
min | -2.144967e+09 | 4.145906e+01 | 1.989020e+01 | 1.000000e+00 | 1.000000e+00 | -2.000000e+00 | 7.000732e-02 | 1.000214e-02 |
25% | 1.950000e+06 | 5.337767e+01 | 3.777790e+01 | 2.000000e+00 | 5.000000e+00 | 1.000000e+00 | 3.800000e+01 | 7.000000e+00 |
50% | 2.990000e+06 | 5.517139e+01 | 4.306774e+01 | 5.000000e+00 | 1.000000e+01 | 2.000000e+00 | 4.803125e+01 | 9.703125e+00 |
75% | 4.802000e+06 | 5.622613e+01 | 6.564895e+01 | 9.000000e+00 | 1.600000e+01 | 2.000000e+00 | 6.312500e+01 | 1.270312e+01 |
max | 2.147484e+09 | 7.198040e+01 | 1.625361e+02 | 3.900000e+01 | 3.900000e+01 | 1.000000e+01 | 7.856000e+03 | 1.000000e+04 |
# Names of all categorical columns.
cat_features = df.select_dtypes(include=['category']).columns.to_list()
cat_features
['region', 'building_type', 'object_type']
# Names of all numeric columns.
num_features = df.select_dtypes(include=['number']).columns.to_list()
num_features
['price',
'geo_lat',
'geo_lon',
'level',
'levels',
'rooms',
'area',
'kitchen_area']
# Drop the listing date and time columns.
df = df.drop(columns=['date', 'time'])
# Memory usage shrank several-fold (135 MB versus 543 MB in the recorded run).
df.info(show_counts=True)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5477006 entries, 0 to 5477005
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 price 5477006 non-null int64
1 geo_lat 5477006 non-null float32
2 geo_lon 5477006 non-null float32
3 region 5477006 non-null category
4 building_type 5477006 non-null category
5 level 5477006 non-null int8
6 levels 5477006 non-null int8
7 rooms 5477006 non-null int8
8 area 5477006 non-null float16
9 kitchen_area 5477006 non-null float16
10 object_type 5477006 non-null category
dtypes: category(3), float16(2), float32(2), int64(1), int8(3)
memory usage: 135.8 MB
# Report how many distinct values each categorical column holds.
for cat in cat_features:
    print(f'{cat} - numer of unique = {df[cat].nunique()}')
region - numer of unique = 84
building_type - numer of unique = 6
object_type - numer of unique = 2
# Print the frequency of every category in each categorical column.
for col in cat_features:
    print(f'Unique categories in {col}: {df[col].value_counts()}')
Unique categories in region: region
9654 1049435
2843 637224
81 500368
2661 461820
3 439511
...
16705 139
69 77
4963 65
1901 12
61888 5
Name: count, Length: 84, dtype: int64
Unique categories in building_type: building_type
1 1955661
3 1892756
2 1130731
0 307165
4 174356
5 16337
Name: count, dtype: int64
Unique categories in object_type: object_type
1 3863809
11 1613197
Name: count, dtype: int64
Очистка данных
# Keep only rows that pass basic sanity filters.
df = df.loc[df['price'] > 50000]
df = df.query('area >= 8')
df = df.query('kitchen_area >= 3')
# The kitchen must be smaller than the whole flat.
df = df.query('kitchen_area < area')
# Drop rows where rooms == -2.
df = df.query('rooms != -2')
# The flat's floor cannot exceed the number of floors in the building.
df = df.query('level <= levels')

# Recode the "studio" marker from -1 to 0.
df.loc[df['rooms'] == -1, 'rooms'] = 0
Анализ признаков для модели
https://seaborn.pydata.org/examples/index.html - галерея примеров
displot
# Price distribution on a log scale, split by building type.
sns.displot(df, x='price', bins=100, hue='building_type', kde=True, log_scale=True)
<seaborn.axisgrid.FacetGrid at 0x7df3e5371c00>
# Create a new feature: relative height of the flat's floor in the building.
# Built with vectorized conditions instead of two row-wise df.apply passes.
# np.select takes the first matching condition, so 'first' and 'last'
# override the ratio-based labels, as in the original two-step logic.
floor_ratio = df['level'].astype('float64') / df['levels'].astype('float64')
floor_conditions = [
    df['level'] == 1,             # ground floor
    df['level'] == df['levels'],  # top floor
    floor_ratio <= 0.3,
    floor_ratio > 0.7,
]
floor_labels = ['first', 'last', 'low', 'hi']
df['floor_level'] = np.select(floor_conditions, floor_labels, default='mid')

df['floor_level'].value_counts()
floor_level
mid 2092890
hi 1208629
low 740915
last 697099
first 654013
Name: count, dtype: int64
# Price distribution on a log scale, split by the floor_level feature.
sns.displot(df, x='price', bins=100, hue='floor_level', kde=True, log_scale=True)
<seaborn.axisgrid.FacetGrid at 0x7df3e55f4be0>
heatmap
# Pairwise correlation of the numeric features, drawn as an annotated heatmap.
feature_correlation = df[num_features].corr()
sns.heatmap(feature_correlation, annot=True)
<Axes: >
Групповые операции
def flat_index(df_stats):
    """Flatten multi-level column labels and turn the index into columns.

    A label such as ``('price', 'mean')`` becomes ``'price_mean'``. Any number
    of column levels is supported, and a frame whose columns are already flat
    only has its index reset. The frame is modified in place.

    Args:
        df_stats: DataFrame to flatten, e.g. the result of
            ``groupby(...).agg([...])``.

    Returns:
        The same ``df_stats`` object, with flat string column names and the
        former index moved into ordinary column(s).
    """
    if df_stats.columns.nlevels > 1:
        # Flatten before reset_index so the index column keeps its plain name.
        df_stats.columns = [
            '_'.join(str(part) for part in label)
            for label in df_stats.columns.to_flat_index()
        ]
    df_stats.reset_index(inplace=True)
    return df_stats
# Mean and standard deviation of every numeric feature per building type.
# observed=False is passed explicitly: it keeps the current grouping behaviour
# and silences the pandas FutureWarning about the changing default.
aggregated_df = (
    df[num_features + ['building_type']]
    .groupby(by='building_type', observed=False)
    .agg(['mean', 'std'])
)
# Collapse the two-level column labels into names like 'price_mean'.
aggregated_df = flat_index(aggregated_df)
aggregated_df
/tmp/ipykernel_1524625/3084051676.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
aggregated_df = df[num_features+['building_type']].groupby(by='building_type').agg(['mean', 'std'])
building_type | price_mean | price_std | geo_lat_mean | geo_lat_std | geo_lon_mean | geo_lon_std | level_mean | level_std | levels_mean | levels_std | rooms_mean | rooms_std | area_mean | area_std | kitchen_area_mean | kitchen_area_std | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 5.284833e+06 | 1.279574e+07 | 53.747444 | 5.098192 | 55.518322 | 22.517204 | 8.851292 | 5.923950 | 16.738440 | 6.499030 | 1.686900 | 0.934588 | 58.559349 | 37.077482 | 12.792236 | 6.599672 |
1 | 1 | 3.512978e+06 | 1.952254e+07 | 54.790298 | 3.357936 | 61.443077 | 21.923780 | 5.571202 | 4.060143 | 10.213742 | 4.642660 | 1.794023 | 0.966615 | 51.407722 | 25.746609 | 9.298869 | 4.062081 |
2 | 2 | 7.126816e+06 | 1.267507e+07 | 53.999779 | 5.346035 | 42.106968 | 13.565219 | 9.031732 | 6.130808 | 16.850670 | 6.862993 | 1.697287 | 0.928301 | 57.157982 | 32.994183 | 13.154365 | 6.146328 |
3 | 3 | 4.068649e+06 | 8.331654e+06 | 53.556213 | 4.910420 | 51.249592 | 19.097178 | 4.929223 | 3.997643 | 8.741839 | 5.626125 | 1.878324 | 0.944347 | 55.062199 | 36.979730 | 10.442657 | 6.912905 |
4 | 4 | 4.004994e+06 | 3.367351e+07 | 51.181896 | 5.909806 | 47.777088 | 17.570242 | 5.430987 | 4.308443 | 9.746289 | 5.743188 | 1.811687 | 0.892482 | 52.034588 | 42.411056 | 10.051356 | 4.793207 |
5 | 5 | 2.618608e+06 | 1.543122e+07 | 56.440563 | 3.981515 | 63.951946 | 28.464556 | 2.472882 | 3.850120 | 3.766808 | 5.931042 | 2.031180 | 0.907434 | 50.446354 | 52.361071 | 9.285330 | 4.603177 |
lineplot
# Mean price per building type.
sns.lineplot(aggregated_df, x='building_type', y='price_mean', label='mean')
<Axes: xlabel='building_type', ylabel='price_mean'>
subplots
# 2x2 grid of line plots. `plt` is bound to the top-level matplotlib package
# in this file, hence the `plt.pyplot` access.
fig, axs = plt.pyplot.subplots(2, 2)
fig.tight_layout(pad=1)
fig.set_size_inches(16.5, 14, forward=True)

# Top-left: mean price per building type.
sns.lineplot(aggregated_df, x='building_type', y='price_mean', label='mean', ax=axs[0, 0])

# Top-right: mean total area and mean kitchen area on shared axes.
sns.lineplot(aggregated_df, x='building_type', y='area_mean', label='area', ax=axs[0, 1])
sns.lineplot(aggregated_df, x='building_type', y='kitchen_area_mean', label='kitchen_area_mean', ax=axs[0, 1])

# Bottom-left: mean floor per building type.
sns.lineplot(aggregated_df, x='building_type', y='level_mean', label='level_mean', ax=axs[1, 0])

# Bottom-right: grouping done directly by seaborn on the unaggregated frame.
sns.lineplot(df, x='building_type', y='area', label='area', ax=axs[1, 1])
<Axes: xlabel='building_type', ylabel='area'>
Bokeh
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool, Legend
from bokeh.io import output_notebook
output_notebook()
""
# Interactive scatter of price against area on a 10% random sample of rows.
source = ColumnDataSource(data=df.sample(frac=0.1))
p = figure(width=1000)
p.scatter(source=source, x='area', y='price')
# Hover tooltips show the area, price and floor of the point under the cursor.
hover = HoverTool(tooltips=[('flat area ', '@area'),
                            ('price', '@price'),
                            ('level', '@level')])
p.add_tools(hover)
show(p)
from bokeh.transform import factor_cmap

# Interactive scatter of floor against area on a 10% random sample of rows,
# coloured by the floor_level category.
types = df['floor_level'].unique().tolist()
source = ColumnDataSource(data=df.sample(frac=0.1))
p = figure(width=1000)
p.scatter(source=source, x='area', y='level',
          color=factor_cmap('floor_level', 'Category10_5', types))
# Hover tooltips show the area, price and floor of the point under the cursor.
hover = HoverTool(tooltips=[('area', '@area'),
                            ('price', '@price'),
                            ('level', '@level')])
p.add_tools(hover)
show(p)
Save clean dataset
# Save the processed frame so later stages do not have to repeat the cleaning.
# Pickle is preferred because it preserves every dtype, including category.
df.to_pickle('./data/clean_data.pkl')
# The saved frame can be read back like this.
# NOTE: unpickling executes code from the file, so only load pickles you
# produced yourself.
df = pd.read_pickle('./data/clean_data.pkl')
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 5393546 entries, 0 to 5477005
Data columns (total 11 columns):
# Column Dtype
--- ------ -----
0 price int64
1 geo_lat float32
2 geo_lon float32
3 region category
4 building_type category
5 level int8
6 levels int8
7 rooms int8
8 area float16
9 kitchen_area float16
10 object_type category
dtypes: category(3), float16(2), float32(2), int64(1), int8(3)
memory usage: 174.9 MB
Выводы после EDA
Выводы, полученные в ходе анализа:
- перечислить все действия, проведенные на этапе очистки данных
- Создавались ли новые признаки?
- Какие закономерности выявлены по графикам, которые могут быть полезны в дальнейшем для решения задачи?