Вы не можете выбрать более 25 тем
Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.
12 KiB
12 KiB
import pandas as pd
import matplotlib as plt
import seaborn as sns
import numpy as np
Загрузка и знакомство с данными
# dataset https://www.kaggle.com/datasets/mrdaniilak/russia-real-estate-20182021/data
# Load the dataset from the local CSV file and preview its first 10 rows.
df = pd.read_csv('data/all_v2.csv')
df.head(10)
| | price | date | time | geo_lat | geo_lon | region | building_type | level | levels | rooms | area | kitchen_area | object_type |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 6050000 | 2018-02-19 | 20:00:21 | 59.805808 | 30.376141 | 2661 | 1 | 8 | 10 | 3 | 82.6 | 10.8 | 1 |
1 | 8650000 | 2018-02-27 | 12:04:54 | 55.683807 | 37.297405 | 81 | 3 | 5 | 24 | 2 | 69.1 | 12.0 | 1 |
2 | 4000000 | 2018-02-28 | 15:44:00 | 56.295250 | 44.061637 | 2871 | 1 | 5 | 9 | 3 | 66.0 | 10.0 | 1 |
3 | 1850000 | 2018-03-01 | 11:24:52 | 44.996132 | 39.074783 | 2843 | 4 | 12 | 16 | 2 | 38.0 | 5.0 | 11 |
4 | 5450000 | 2018-03-01 | 17:42:43 | 55.918767 | 37.984642 | 81 | 3 | 13 | 14 | 2 | 60.0 | 10.0 | 1 |
5 | 3300000 | 2018-03-02 | 21:18:42 | 55.908253 | 37.726448 | 81 | 1 | 4 | 5 | 1 | 32.0 | 6.0 | 1 |
6 | 4704280 | 2018-03-04 | 12:35:25 | 55.621097 | 37.431002 | 3 | 2 | 1 | 25 | 1 | 31.7 | 6.0 | 11 |
7 | 3600000 | 2018-03-04 | 20:52:38 | 59.875526 | 30.395457 | 2661 | 1 | 2 | 5 | 1 | 31.1 | 6.0 | 1 |
8 | 3390000 | 2018-03-05 | 07:07:05 | 53.195031 | 50.106952 | 3106 | 2 | 4 | 24 | 2 | 64.0 | 13.0 | 11 |
9 | 2800000 | 2018-03-06 | 09:57:10 | 55.736972 | 38.846457 | 81 | 1 | 9 | 10 | 2 | 55.0 | 8.0 | 1 |
Очистка данных
Анализ признаков для модели
https://seaborn.pydata.org/examples/index.html - галерея примеров
histplot
heatmap
Групповые операции
def flat_index(df_stats):
    """Collapse two-level column labels into flat ``"<level0>_<level1>"`` names.

    The frame is modified in place: its columns are replaced by the joined
    labels and its index is moved into regular columns via ``reset_index``.

    Args:
        df_stats: DataFrame whose columns have (at least) two levels of
            string labels; only levels 0 and 1 are used.

    Returns:
        The same ``df_stats`` object, after the in-place modification.
    """
    columns = df_stats.columns
    # Element-wise string concatenation of the two label levels already
    # yields a flat Index, so no further flattening step is needed.
    df_stats.columns = (
        columns.get_level_values(0) + '_' + columns.get_level_values(1)
    )
    df_stats.reset_index(inplace=True)
    return df_stats
lineplot
subplots
# Create a 2x2 grid of axes. `plt` is bound to the top-level matplotlib
# package in this file's imports, hence the explicit `.pyplot` access.
fig, axs = plt.pyplot.subplots(2, 2)
fig.tight_layout(pad=1)
# forward=True asks matplotlib to propagate the new size to the canvas.
fig.set_size_inches(16.5, 14, forward=True)
displot
# Print the number of distinct values for every column named in
# `categorial_cols` (defined elsewhere in the notebook).
for col in categorial_cols:
    print(f'Unique categories in {col}: {df[col].nunique()}')
histplot
Bokeh
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource, HoverTool, Legend
from bokeh.io import output_notebook
# Configure Bokeh to render its output in notebook cells, so that later
# show() calls display plots inline.
output_notebook()