diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..42bccd7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +invisible* +.venv* +.~lock* diff --git a/README.md b/README.md new file mode 100644 index 0000000..92b96bb --- /dev/null +++ b/README.md @@ -0,0 +1,11 @@ +# Интеллектуальные информационные системы + +## Лекции + +| Дата |Лекция | +|:----------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 05.09.2024 | [Вводная лекция](lectures/lec1.odp) | +| 12.09.2024 | [Изолирование окружения. Docker](lectures/lec2-docker.odp) | +| 19.09.2024 | [Разведочный анализ данных](lectures/lec3-eda.odp) | + +## Лабораторные работы diff --git a/assets/docker/Dockerfile b/assets/docker/Dockerfile new file mode 100644 index 0000000..e079d9e --- /dev/null +++ b/assets/docker/Dockerfile @@ -0,0 +1,9 @@ +FROM python:3.11-slim + +COPY . /my_app + +WORKDIR /my_app + +RUN pip install tqdm + +ENTRYPOINT [ "python", "main.py" ] \ No newline at end of file diff --git a/assets/docker/main.py b/assets/docker/main.py new file mode 100644 index 0000000..92ea01b --- /dev/null +++ b/assets/docker/main.py @@ -0,0 +1,6 @@ +import sys + +def main(a = 3, b = 5): + print(f"multiply {a} by {b} is {a * b}") + +main(int(sys.argv[1]), int(sys.argv[2])) \ No newline at end of file diff --git a/assets/eda/eda.ipynb b/assets/eda/eda.ipynb new file mode 100644 index 0000000..726c7f5 --- /dev/null +++ b/assets/eda/eda.ipynb @@ -0,0 +1,449 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "e312113e", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "import pandas as pd\n", + "import matplotlib as plt\n", + "import seaborn as sns\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "id": "de2c028d", + "metadata": {}, + "source": [ + "# Загрузка и знакомство с данными" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 3, + "id": "5cd00195", + "metadata": {}, + "outputs": [], + "source": [ + "# dataset https://www.kaggle.com/datasets/mrdaniilak/russia-real-estate-20182021/data \n", + "\n", + "df = pd.read_csv('data/all_v2.csv')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "05b57100", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pricedatetimegeo_latgeo_lonregionbuilding_typelevellevelsroomsareakitchen_areaobject_type
060500002018-02-1920:00:2159.80580830.37614126611810382.610.81
186500002018-02-2712:04:5455.68380737.297405813524269.112.01
240000002018-02-2815:44:0056.29525044.0616372871159366.010.01
318500002018-03-0111:24:5244.99613239.074783284341216238.05.011
454500002018-03-0117:42:4355.91876737.9846428131314260.010.01
533000002018-03-0221:18:4255.90825337.72644881145132.06.01
647042802018-03-0412:35:2555.62109737.43100232125131.76.011
736000002018-03-0420:52:3859.87552630.3954572661125131.16.01
833900002018-03-0507:07:0553.19503150.10695231062424264.013.011
928000002018-03-0609:57:1055.73697238.846457811910255.08.01
\n", + "
" + ], + "text/plain": [ + " price date time geo_lat geo_lon region building_type \\\n", + "0 6050000 2018-02-19 20:00:21 59.805808 30.376141 2661 1 \n", + "1 8650000 2018-02-27 12:04:54 55.683807 37.297405 81 3 \n", + "2 4000000 2018-02-28 15:44:00 56.295250 44.061637 2871 1 \n", + "3 1850000 2018-03-01 11:24:52 44.996132 39.074783 2843 4 \n", + "4 5450000 2018-03-01 17:42:43 55.918767 37.984642 81 3 \n", + "5 3300000 2018-03-02 21:18:42 55.908253 37.726448 81 1 \n", + "6 4704280 2018-03-04 12:35:25 55.621097 37.431002 3 2 \n", + "7 3600000 2018-03-04 20:52:38 59.875526 30.395457 2661 1 \n", + "8 3390000 2018-03-05 07:07:05 53.195031 50.106952 3106 2 \n", + "9 2800000 2018-03-06 09:57:10 55.736972 38.846457 81 1 \n", + "\n", + " level levels rooms area kitchen_area object_type \n", + "0 8 10 3 82.6 10.8 1 \n", + "1 5 24 2 69.1 12.0 1 \n", + "2 5 9 3 66.0 10.0 1 \n", + "3 12 16 2 38.0 5.0 11 \n", + "4 13 14 2 60.0 10.0 1 \n", + "5 4 5 1 32.0 6.0 1 \n", + "6 1 25 1 31.7 6.0 11 \n", + "7 2 5 1 31.1 6.0 1 \n", + "8 4 24 2 64.0 13.0 11 \n", + "9 9 10 2 55.0 8.0 1 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "6c892b3e", + "metadata": {}, + "source": [ + "# Очистка данных" + ] + }, + { + "cell_type": "markdown", + "id": "a3d3ad69", + "metadata": {}, + "source": [ + "# Анализ признаков для модели\n", + "\n", + "https://seaborn.pydata.org/examples/index.html - галерея примеров" + ] + }, + { + "cell_type": "markdown", + "id": "78845b8b", + "metadata": {}, + "source": [ + "## histplot" + ] + }, + { + "cell_type": "markdown", + "id": "9318a819", + "metadata": {}, + "source": [ + "## heatmap" + ] + }, + { + "cell_type": "markdown", + "id": "f4ab2ef2", + "metadata": {}, + "source": [ + "# Групповые операции" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "11e4da4e", + "metadata": {}, + "outputs": [], + "source": [ + "def 
flat_index(df_stats): \n", + " df_stats.columns = df_stats.columns.get_level_values(0) + '_' + df_stats.columns.get_level_values(1) \n", + " df_stats.columns = df_stats.columns.to_flat_index() \n", + " df_stats.reset_index(inplace=True) \n", + " return df_stats" + ] + }, + { + "cell_type": "markdown", + "id": "0fbb62de", + "metadata": {}, + "source": [ + "## lineplot" + ] + }, + { + "cell_type": "markdown", + "id": "b8bc652d", + "metadata": {}, + "source": [ + "## subplots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fab88d2", + "metadata": {}, + "outputs": [], + "source": [ + "fig, axs = plt.pyplot.subplots(2,2)\n", + "fig.tight_layout(pad=1)\n", + "fig.set_size_inches(16.5, 14, forward=True)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "ba7d6b7c", + "metadata": {}, + "source": [ + "## displot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eca41c1e", + "metadata": {}, + "outputs": [], + "source": [ + "for col in categorial_cols:\n", + " print(f'Unique categories in {col}: {df[col].nunique()}')" + ] + }, + { + "cell_type": "markdown", + "id": "ef2501d0", + "metadata": {}, + "source": [ + "## histplot" + ] + }, + { + "cell_type": "markdown", + "id": "5d64d58a", + "metadata": {}, + "source": [ + "# Bokeh\n", + "https://bokeh.org/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fddb38a2", + "metadata": {}, + "outputs": [], + "source": [ + "from bokeh.plotting import figure, show\n", + "from bokeh.models import ColumnDataSource, HoverTool, Legend\n", + "from bokeh.io import output_notebook \n", + "output_notebook()" + ] + }, + { + "cell_type": "markdown", + "id": "3a7cbaaa", + "metadata": {}, + "source": [ + "# Выводы после EDA" + ] + }, + { + "cell_type": "markdown", + "id": "695334e5", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv_sprint02", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/eda/requirements.txt b/assets/eda/requirements.txt new file mode 100644 index 0000000..df2aea8 --- /dev/null +++ b/assets/eda/requirements.txt @@ -0,0 +1,4 @@ +pandas +bokeh +matplotlib +seaborn \ No newline at end of file diff --git a/assets/mlflow/requirements b/assets/mlflow/requirements new file mode 100644 index 0000000..acede8f --- /dev/null +++ b/assets/mlflow/requirements @@ -0,0 +1 @@ +mlflow==2.7.1 diff --git a/assets/virtual_env/proj1/req.txt b/assets/virtual_env/proj1/req.txt new file mode 100644 index 0000000..b5dcdea --- /dev/null +++ b/assets/virtual_env/proj1/req.txt @@ -0,0 +1,2 @@ +tdqm==0.0.1 +tqdm==4.66.5 diff --git a/assets/virtual_env/proj1/requirements.txt b/assets/virtual_env/proj1/requirements.txt new file mode 100644 index 0000000..78620c4 --- /dev/null +++ b/assets/virtual_env/proj1/requirements.txt @@ -0,0 +1 @@ +tqdm diff --git a/lectures/lec1.odp b/lectures/lec1.odp new file mode 100644 index 0000000..0e757fc Binary files /dev/null and b/lectures/lec1.odp differ diff --git a/lectures/lec2-docker.odp b/lectures/lec2-docker.odp new file mode 100644 index 0000000..34ced31 Binary files /dev/null and b/lectures/lec2-docker.odp differ diff --git a/lectures/lec3-eda.odp b/lectures/lec3-eda.odp new file mode 100644 index 0000000..05ae0fc Binary files /dev/null and b/lectures/lec3-eda.odp differ