{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "e312113e", "metadata": {}, "outputs": [], "source": [ "\n", "import pandas as pd\n", "import matplotlib as plt\n", "import seaborn as sns\n", "import numpy as np" ] }, { "cell_type": "markdown", "id": "de2c028d", "metadata": {}, "source": [ "# Загрузка и знакомство с данными" ] }, { "cell_type": "code", "execution_count": 3, "id": "5cd00195", "metadata": {}, "outputs": [], "source": [ "# dataset https://www.kaggle.com/datasets/mrdaniilak/russia-real-estate-20182021/data \n", "\n", "df = pd.read_csv('data/all_v2.csv')\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "05b57100", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
pricedatetimegeo_latgeo_lonregionbuilding_typelevellevelsroomsareakitchen_areaobject_type
060500002018-02-1920:00:2159.80580830.37614126611810382.610.81
186500002018-02-2712:04:5455.68380737.297405813524269.112.01
240000002018-02-2815:44:0056.29525044.0616372871159366.010.01
318500002018-03-0111:24:5244.99613239.074783284341216238.05.011
454500002018-03-0117:42:4355.91876737.9846428131314260.010.01
533000002018-03-0221:18:4255.90825337.72644881145132.06.01
647042802018-03-0412:35:2555.62109737.43100232125131.76.011
736000002018-03-0420:52:3859.87552630.3954572661125131.16.01
833900002018-03-0507:07:0553.19503150.10695231062424264.013.011
928000002018-03-0609:57:1055.73697238.846457811910255.08.01
\n", "
" ], "text/plain": [ " price date time geo_lat geo_lon region building_type \\\n", "0 6050000 2018-02-19 20:00:21 59.805808 30.376141 2661 1 \n", "1 8650000 2018-02-27 12:04:54 55.683807 37.297405 81 3 \n", "2 4000000 2018-02-28 15:44:00 56.295250 44.061637 2871 1 \n", "3 1850000 2018-03-01 11:24:52 44.996132 39.074783 2843 4 \n", "4 5450000 2018-03-01 17:42:43 55.918767 37.984642 81 3 \n", "5 3300000 2018-03-02 21:18:42 55.908253 37.726448 81 1 \n", "6 4704280 2018-03-04 12:35:25 55.621097 37.431002 3 2 \n", "7 3600000 2018-03-04 20:52:38 59.875526 30.395457 2661 1 \n", "8 3390000 2018-03-05 07:07:05 53.195031 50.106952 3106 2 \n", "9 2800000 2018-03-06 09:57:10 55.736972 38.846457 81 1 \n", "\n", " level levels rooms area kitchen_area object_type \n", "0 8 10 3 82.6 10.8 1 \n", "1 5 24 2 69.1 12.0 1 \n", "2 5 9 3 66.0 10.0 1 \n", "3 12 16 2 38.0 5.0 11 \n", "4 13 14 2 60.0 10.0 1 \n", "5 4 5 1 32.0 6.0 1 \n", "6 1 25 1 31.7 6.0 11 \n", "7 2 5 1 31.1 6.0 1 \n", "8 4 24 2 64.0 13.0 11 \n", "9 9 10 2 55.0 8.0 1 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head(10)" ] }, { "cell_type": "markdown", "id": "6c892b3e", "metadata": {}, "source": [ "# Очистка данных" ] }, { "cell_type": "markdown", "id": "a3d3ad69", "metadata": {}, "source": [ "# Анализ признаков для модели\n", "\n", "https://seaborn.pydata.org/examples/index.html - галерея примеров" ] }, { "cell_type": "markdown", "id": "78845b8b", "metadata": {}, "source": [ "## histplot" ] }, { "cell_type": "markdown", "id": "9318a819", "metadata": {}, "source": [ "## heatmap" ] }, { "cell_type": "markdown", "id": "f4ab2ef2", "metadata": {}, "source": [ "# Групповые операции" ] }, { "cell_type": "code", "execution_count": 19, "id": "11e4da4e", "metadata": {}, "outputs": [], "source": [ "def flat_index(df_stats): \n", " df_stats.columns = df_stats.columns.get_level_values(0) + '_' + df_stats.columns.get_level_values(1) \n", " df_stats.columns = df_stats.columns.to_flat_index() \n", " df_stats.reset_index(inplace=True) \n", " return df_stats" ] }, { "cell_type": "markdown", "id": "0fbb62de", "metadata": {}, "source": [ "## lineplot" ] }, { "cell_type": "markdown", "id": "b8bc652d", "metadata": {}, "source": [ "## subplots" ] }, { "cell_type": "code", "execution_count": null, "id": "7fab88d2", "metadata": {}, "outputs": [], "source": [ "fig, axs = plt.pyplot.subplots(2,2)\n", "fig.tight_layout(pad=1)\n", "fig.set_size_inches(16.5, 14, forward=True)\n", "\n" ] }, { "cell_type": "markdown", "id": "ba7d6b7c", "metadata": {}, "source": [ "## displot" ] }, { "cell_type": "code", "execution_count": null, "id": "eca41c1e", "metadata": {}, "outputs": [], "source": [ "for col in categorial_cols:\n", " print(f'Unique categories in {col}: {df[col].nunique()}')dd" ] }, { "cell_type": "markdown", "id": "ef2501d0", "metadata": {}, "source": [ "## histplot" ] }, { "cell_type": "markdown", "id": "5d64d58a", "metadata": {}, "source": [ "# Bokeh\n", "https://bokeh.org/" ] }, { "cell_type": "code", "execution_count": null, "id": "fddb38a2", "metadata": {}, "outputs": [], "source": [ "from bokeh.plotting import figure, show\n", "from bokeh.models import ColumnDataSource, HoverTool, Legend\n", "from bokeh.io import output_notebook \n", "output_notebook()" ] }, { "cell_type": "markdown", "id": "3a7cbaaa", "metadata": {}, "source": [ "# Выводы после EDA" ] }, { "cell_type": "markdown", "id": "695334e5", "metadata": {}, "source": [] } ], "metadata": { "kernelspec": { "display_name": ".venv_sprint02", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 5 }