diff --git a/README.md b/README.md index 16007a6..37e26da 100644 --- a/README.md +++ b/README.md @@ -8,9 +8,10 @@ | 12.09.2024 | [Изолирование окружения. Docker](./lectures/lec2-docker.odp) - [в формате pptx](./lectures/lec2-docker.pptx) | | 19.09.2024 | [Разведочный анализ данных](./lectures/lec3-eda.odp) - [в формате pptx](./lectures/lec3-eda.pptx) | | 26.09.2024 | [MLFlow](./lectures/lec3-eda.odp) - [в формате pptx](./lectures/lec4-mlflow.pptx) | +| 26.09.2024 | [MLFlow - практика](./assets/mlflow/mlflow.ipynb) | ## Лабораторные работы [Лабораторная работа 1.](./labs/lab1.md) Настройка окружения и раведочный анализ данных -## [Журнал](https://docs.google.com/spreadsheets/d/10juwyGqOhiD_czxfVziLj10aHYTaCp5oDmhesBJkxYM/edit?gid=1516016995#gid=1516016995) \ No newline at end of file +## [Журнал](https://docs.google.com/spreadsheets/d/10juwyGqOhiD_czxfVziLj10aHYTaCp5oDmhesBJkxYM/edit?gid=1516016995#gid=1516016995) \ No newline at end of file diff --git a/assets/mlflow/comment.txt b/assets/mlflow/comment.txt new file mode 100644 index 0000000..f0a4963 --- /dev/null +++ b/assets/mlflow/comment.txt @@ -0,0 +1 @@ +Model for estate \ No newline at end of file diff --git a/assets/mlflow/requirements b/assets/mlflow/requirements deleted file mode 100644 index fcbe262..0000000 --- a/assets/mlflow/requirements +++ /dev/null @@ -1,2 +0,0 @@ -mlflow==2.7.1 -scikit-learn diff --git a/assets/mlflow/requirements.txt b/assets/mlflow/requirements.txt new file mode 100644 index 0000000..971bb15 --- /dev/null +++ b/assets/mlflow/requirements.txt @@ -0,0 +1,3 @@ +numpy==2.1.1 +mlflow==2.16 +scikit-learn \ No newline at end of file diff --git a/assets/mlflow/research.ipynb b/assets/mlflow/research.ipynb new file mode 100644 index 0000000..c1ca6cf --- /dev/null +++ b/assets/mlflow/research.ipynb @@ -0,0 +1,2010 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import mlflow\n", + "\n", + "from 
sklearn.model_selection import train_test_split\n", + "import pandas as pd\n", + "import numpy\n", + "\n", + "from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder\n", + "\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.linear_model import LinearRegression\n", + "\n", + "from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 539355 entries, 1979096 to 5189500\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 price 539355 non-null int64 \n", + " 1 geo_lat 539355 non-null float32 \n", + " 2 geo_lon 539355 non-null float32 \n", + " 3 region 539355 non-null category\n", + " 4 building_type 539355 non-null category\n", + " 5 level 539355 non-null int8 \n", + " 6 levels 539355 non-null int8 \n", + " 7 rooms 539355 non-null int8 \n", + " 8 area 539355 non-null float16 \n", + " 9 kitchen_area 539355 non-null float16 \n", + " 10 object_type 539355 non-null category\n", + " 11 floor_level 539355 non-null object \n", + "dtypes: category(3), float16(2), float32(2), int64(1), int8(3), object(1)\n", + "memory usage: 21.6+ MB\n" + ] + } + ], + "source": [ + "df = pd.read_pickle('data/clean_data.pkl').sample(frac=0.1, random_state = 2) # Уменьшаем размер чтобы модель быстрее обучалась на лекции\n", + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.rename(columns={'price': 'target'})" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/pandas/io/formats/format.py:1458: RuntimeWarning: overflow encountered in cast\n", + " has_large_values = (abs_vals > 1e6).any()\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
targetgeo_latgeo_lonregionbuilding_typelevellevelsroomsareakitchen_areaobject_typefloor_level
1979096130000052.82109883.1130376817111366.5000010.0000001first
1833303880000055.70753937.467068311516246.000007.0000001hi
1494335195800054.98840082.783691965421317136.5000011.96093811hi
2747476146160053.29855350.3263823106355132.593759.60156211last
5027275300000042.89793447.62482540073410270.0000012.00000011mid
.......................................
2476626149000054.94380682.95787096541210148.0625014.00000011low
14874541900000055.77224037.731136334123100.0000013.0000001mid
2772844120000054.47459053.5318072722159132.093757.0000001mid
3982304230000055.37826539.05331081115249.000009.0000001first
5189500915773055.54295737.47991931817252.3125017.59375011mid
\n", + "

539355 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " target geo_lat geo_lon region building_type level levels \\\n", + "1979096 1300000 52.821098 83.113037 6817 1 1 1 \n", + "1833303 8800000 55.707539 37.467068 3 1 15 16 \n", + "1494335 1958000 54.988400 82.783691 9654 2 13 17 \n", + "2747476 1461600 53.298553 50.326382 3106 3 5 5 \n", + "5027275 3000000 42.897934 47.624825 4007 3 4 10 \n", + "... ... ... ... ... ... ... ... \n", + "2476626 1490000 54.943806 82.957870 9654 1 2 10 \n", + "1487454 19000000 55.772240 37.731136 3 3 4 12 \n", + "2772844 1200000 54.474590 53.531807 2722 1 5 9 \n", + "3982304 2300000 55.378265 39.053310 81 1 1 5 \n", + "5189500 9157730 55.542957 37.479919 3 1 8 17 \n", + "\n", + " rooms area kitchen_area object_type floor_level \n", + "1979096 3 66.50000 10.000000 1 first \n", + "1833303 2 46.00000 7.000000 1 hi \n", + "1494335 1 36.50000 11.960938 11 hi \n", + "2747476 1 32.59375 9.601562 11 last \n", + "5027275 2 70.00000 12.000000 11 mid \n", + "... ... ... ... ... ... \n", + "2476626 1 48.06250 14.000000 11 low \n", + "1487454 3 100.00000 13.000000 1 mid \n", + "2772844 1 32.09375 7.000000 1 mid \n", + "3982304 2 49.00000 9.000000 1 first \n", + "5189500 2 52.31250 17.593750 11 mid \n", + "\n", + "[539355 rows x 12 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.25, random_state=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['region', 'building_type', 'object_type', 'floor_level']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "cat_features = X_train.select_dtypes(include=['category','object']).columns.to_list()\n", + 
"cat_features" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['geo_lat', 'geo_lon', 'level', 'levels', 'rooms', 'area', 'kitchen_area']" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "num_features = X_train.select_dtypes(include=['number']).columns.to_list()\n", + "num_features" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://scikit-learn.org/stable/api/sklearn.preprocessing.html - разные способы кодирования и скалирования" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "s_scaler = StandardScaler()\n", + "l_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=999) # unknown_value нужно выбирать с умом\n", + "regressor = RandomForestRegressor(n_estimators=20, max_depth=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Column transformer" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Для удобной работы со столбцами\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " ('num', s_scaler, num_features), # преобразования для числовых признаков\n", + " ('cat', l_encoder, cat_features), # преобразования для категориальных признаков\n", + " ],\n", + " remainder='drop' ) # Удаляем столбцы, которые не затронуты преобразования" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('preprocessor',\n",
+       "                 ColumnTransformer(transformers=[('num', StandardScaler(),\n",
+       "                                                  ['geo_lat', 'geo_lon',\n",
+       "                                                   'level', 'levels', 'rooms',\n",
+       "                                                   'area', 'kitchen_area']),\n",
+       "                                                 ('cat',\n",
+       "                                                  OrdinalEncoder(handle_unknown='use_encoded_value',\n",
+       "                                                                 unknown_value=999),\n",
+       "                                                  ['region', 'building_type',\n",
+       "                                                   'object_type',\n",
+       "                                                   'floor_level'])])),\n",
+       "                ('model',\n",
+       "                 RandomForestRegressor(max_depth=10, n_estimators=20))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "Pipeline(steps=[('preprocessor',\n", + " ColumnTransformer(transformers=[('num', StandardScaler(),\n", + " ['geo_lat', 'geo_lon',\n", + " 'level', 'levels', 'rooms',\n", + " 'area', 'kitchen_area']),\n", + " ('cat',\n", + " OrdinalEncoder(handle_unknown='use_encoded_value',\n", + " unknown_value=999),\n", + " ['region', 'building_type',\n", + " 'object_type',\n", + " 'floor_level'])])),\n", + " ('model',\n", + " RandomForestRegressor(max_depth=10, n_estimators=20))])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "pipeline = Pipeline(steps=[('preprocessor', preprocessor), \n", + " ('model', regressor)])\n", + "\n", + "pipeline.fit(X_train, y_train)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'mae': np.float64(1276343.108894747),\n", + " 'mape': np.float64(0.35471390164231303),\n", + " 'mse': np.float64(174567675833231.12)}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions = pipeline.predict(X_test) \n", + "\n", + "metrics = {}\n", + "metrics[\"mae\"] = mean_absolute_error(y_test, predictions) \n", + "metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions)\n", + "metrics[\"mse\"] = mean_squared_error(y_test, predictions)\n", + "\n", + "metrics" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Работаем с MLflow локально\n", + "TRACKING_SERVER_HOST = \"127.0.0.1\"\n", + "TRACKING_SERVER_PORT = 5000\n", + "\n", + "registry_uri = f\"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}\"\n", + "tracking_uri = f\"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}\"\n", + "\n", + "mlflow.set_tracking_uri(tracking_uri) \n", + "mlflow.set_registry_uri(registry_uri) \n", + "\n" + ] + }, + { + "cell_type": "code", + 
"execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# название тестового эксперимента, запуска (run) внутри него, имени, под которым модель будет регистрироваться\n", + "EXPERIMENT_NAME = \"estate_project\"\n", + "RUN_NAME = \"baseline model\"\n", + "REGISTRY_MODEL_NAME = \"estate_model_rf\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Логируем вручную" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "# Обязательно логируем сигнатуру модели и пример входных данных. 
Подготовим их\n", + "from mlflow.models import infer_signature\n", + "\n", + "signature = infer_signature(model_input = X_train.head(5))\n", + "input_example = X_train.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "# Будем логировать requirements и артефакт - текстовый файл\n", + "req_file = 'requirements.txt'\n", + "art = 'comment.txt'" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# Параметры, которые будут залогированы, можем задавать вручную или полностью взять из модели\n", + "#params_dict = {'n_estimators': 10, 'max_depth': 10}\n", + "params_dict = pipeline.get_params()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/10/03 18:59:13 INFO mlflow.tracking._tracking_service.client: 🏃 View run baseline model at: http://127.0.0.1:5000/#/experiments/1/runs/24e41bb582554f42953fe6dc2b6b190e.\n", + "2024/10/03 18:59:13 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.\n" + ] + } + ], + "source": [ + "# Когда создаем новый эксперимент, то: \n", + "experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)\n", + "\n", + "# Впоследствии, 
чтобы добавлять запуски в этот же эксперимент, мы должны получить его id:\n", + "#experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n", + "\n", + "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n", + " # получаем уникальный идентификатор запуска эксперимента\n", + " run_id = run.info.run_id \n", + " mlflow.sklearn.log_model(pipeline, \n", + " artifact_path=\"models\",\n", + " signature=signature,\n", + " input_example=input_example,\n", + " pip_requirements=req_file\n", + " )\n", + " mlflow.log_metrics(metrics)\n", + " mlflow.log_artifact(art)\n", + " mlflow.log_params(params_dict)\n", + "\n", + "run = mlflow.get_run(run_id) \n", + "assert (run.info.status =='FINISHED')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Удаление runs, experiments\n", + "\n", + "Использовать осторожно" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n", + "#mlflow.delete_experiment(experiment_id)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
run_idexperiment_idstatusartifact_uristart_timeend_timemetrics.maemetrics.mapemetrics.mseparams.preprocessor__cat__handle_unknown...params.model__max_samplesparams.preprocessor__transformersparams.model__monotonic_cstparams.model__warm_startparams.preprocessor__remaindertags.mlflow.usertags.mlflow.source.typetags.mlflow.runNametags.mlflow.source.nametags.mlflow.log-model.history
024e41bb582554f42953fe6dc2b6b190e1FINISHEDmlflow-artifacts:/1/24e41bb582554f42953fe6dc2b...2024-10-03 15:59:12.732000+00:002024-10-03 15:59:13.921000+00:001.276343e+060.3547141.745677e+14use_encoded_value...None[('num', StandardScaler(), ['geo_lat', 'geo_lo...NoneFalsedropandreyLOCALbaseline model/home/andrey/work/institute/MLE/assets/mlflow/...[{\"run_id\": \"24e41bb582554f42953fe6dc2b6b190e\"...
\n", + "

1 rows × 57 columns

\n", + "
" + ], + "text/plain": [ + " run_id experiment_id status \\\n", + "0 24e41bb582554f42953fe6dc2b6b190e 1 FINISHED \n", + "\n", + " artifact_uri \\\n", + "0 mlflow-artifacts:/1/24e41bb582554f42953fe6dc2b... \n", + "\n", + " start_time end_time \\\n", + "0 2024-10-03 15:59:12.732000+00:00 2024-10-03 15:59:13.921000+00:00 \n", + "\n", + " metrics.mae metrics.mape metrics.mse \\\n", + "0 1.276343e+06 0.354714 1.745677e+14 \n", + "\n", + " params.preprocessor__cat__handle_unknown ... params.model__max_samples \\\n", + "0 use_encoded_value ... None \n", + "\n", + " params.preprocessor__transformers \\\n", + "0 [('num', StandardScaler(), ['geo_lat', 'geo_lo... \n", + "\n", + " params.model__monotonic_cst params.model__warm_start \\\n", + "0 None False \n", + "\n", + " params.preprocessor__remainder tags.mlflow.user tags.mlflow.source.type \\\n", + "0 drop andrey LOCAL \n", + "\n", + " tags.mlflow.runName tags.mlflow.source.name \\\n", + "0 baseline model /home/andrey/work/institute/MLE/assets/mlflow/... \n", + "\n", + " tags.mlflow.log-model.history \n", + "0 [{\"run_id\": \"24e41bb582554f42953fe6dc2b6b190e\"... \n", + "\n", + "[1 rows x 57 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "mlflow.search_runs(\n", + " #experiment_ids=[experiment_id],\n", + " experiment_names=[EXPERIMENT_NAME],\n", + " # filter_string='status = \"FAILED\"'\n", + " #filter_string='metrics.mae > 1'\n", + " \n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "#mlflow.delete_run('74d2a7a40c07413c9cf65df841164356')" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Автологирование\n", + "После включения будет срабатывать на каждом обучении модели (на методе fit()).\n", + "\n", + "Есть плюсы, есть и минусы. 
Предлагается сделать прогон и сравнить с результатами вручную " + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/10/03 18:59:14 WARNING mlflow.utils.autologging_utils: MLflow sklearn autologging is known to be compatible with 0.24.1 <= scikit-learn <= 1.5.1, but the installed version is 1.5.2. If you encounter errors during autologging, try upgrading / downgrading scikit-learn to a compatible version, or try upgrading MLflow.\n", + "2024/10/03 19:02:16 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\"\n", + "2024/10/03 19:02:40 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. 
Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\"\n", + "2024/10/03 19:02:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run auto at: http://127.0.0.1:5000/#/experiments/1/runs/2ced09116c264623b89d8df7fe33cb10.\n", + "2024/10/03 19:02:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.\n" + ] + } + ], + "source": [ + "mlflow.sklearn.autolog()\n", + "\n", + "with mlflow.start_run(run_name='auto', experiment_id=experiment_id) as run:\n", + " pipeline.fit(X_train, y_train)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# Отключаем автологирование\n", + "mlflow.sklearn.autolog(disable=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Model #2\n", + "Обучим вторую \"маленькую\" модель\n" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "regressor2 = RandomForestRegressor(n_estimators=10, max_depth=6)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('preprocessor',\n",
+       "                 ColumnTransformer(transformers=[('num', StandardScaler(),\n",
+       "                                                  ['geo_lat', 'geo_lon',\n",
+       "                                                   'level', 'levels', 'rooms',\n",
+       "                                                   'area', 'kitchen_area']),\n",
+       "                                                 ('cat',\n",
+       "                                                  OrdinalEncoder(handle_unknown='use_encoded_value',\n",
+       "                                                                 unknown_value=999),\n",
+       "                                                  ['region', 'building_type',\n",
+       "                                                   'object_type',\n",
+       "                                                   'floor_level'])])),\n",
+       "                ('model', RandomForestRegressor(max_depth=6, n_estimators=10))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "Pipeline(steps=[('preprocessor',\n", + " ColumnTransformer(transformers=[('num', StandardScaler(),\n", + " ['geo_lat', 'geo_lon',\n", + " 'level', 'levels', 'rooms',\n", + " 'area', 'kitchen_area']),\n", + " ('cat',\n", + " OrdinalEncoder(handle_unknown='use_encoded_value',\n", + " unknown_value=999),\n", + " ['region', 'building_type',\n", + " 'object_type',\n", + " 'floor_level'])])),\n", + " ('model', RandomForestRegressor(max_depth=6, n_estimators=10))])" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline = Pipeline(steps=[('preprocessor', preprocessor), \n", + " ('model', regressor2)])\n", + "\n", + "pipeline.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'mae': np.float64(1536543.887713661),\n", + " 'mape': np.float64(0.42528854535519156),\n", + " 'mse': np.float64(210549541556055.7)}" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions = pipeline.predict(X_test) \n", + "metrics = {}\n", + "metrics[\"mae\"] = mean_absolute_error(y_test, predictions) \n", + "metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions)\n", + "metrics[\"mse\"] = mean_squared_error(y_test, predictions)\n", + "\n", + "metrics" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/10/03 19:02:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run smaller_model at: http://127.0.0.1:5000/#/experiments/1/runs/20f66bd4c3754a04b5e47ecc0f577e76.\n", + "2024/10/03 19:02:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.\n" + ] + } + ], + "source": [ + "# !!! 
Проверить название прогона, а также все логируемые параметры и артефакты, что они соответствуют второй \"маленькой\" модели. \n", + "\n", + "\n", + "RUN_NAME = 'smaller_model'\n", + "# Когда создаем новый эксперимент, то: \n", + "#experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)\n", + "\n", + "# Впоследствии, чтобы добавлять запуски в этот же эксперимент, мы должны получить его id:\n", + "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n", + "\n", + "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n", + " # получаем уникальный идентификатор запуска эксперимента\n", + " run_id = run.info.run_id \n", + " mlflow.sklearn.log_model(pipeline, \n", + " artifact_path=\"models\",\n", + " signature=signature,\n", + " input_example=input_example,\n", + " pip_requirements=req_file\n", + " )\n", + " mlflow.log_metrics(metrics)\n", + " mlflow.log_artifact(art)\n", + " mlflow.log_params(pipeline.get_params())\n", + "\n", + "run = mlflow.get_run(run_id) \n", + "assert (run.info.status =='FINISHED')" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/10/03 19:02:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run no_model at: http://127.0.0.1:5000/#/experiments/1/runs/6f6fe970eb74485d866e918b733f8f61.\n", + "2024/10/03 19:02:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.\n" + ] + } + ], + "source": [ + "# No model\n", + "# Логировать можно только артефакты, без модели. 
Например, залогировать графики после этапа EDA\n", + "\n", + "RUN_NAME = 'no_model'\n", + "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n", + "\n", + "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n", + " run_id = run.info.run_id \n", + " mlflow.log_artifact(art)\n", + "\n", + "\n", + "run = mlflow.get_run(run_id) \n", + "assert (run.info.status =='FINISHED')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Registered model 'estate_model_rf' already exists. Creating a new version of this model...\n", + "2024/10/03 19:03:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: estate_model_rf, version 1\n", + "Created version '1' of model 'estate_model_rf'.\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "run_id = '' # Указываем run id\n", + "mlflow.register_model(f\"runs:/{run_id}/models\", REGISTRY_MODEL_NAME)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Registered model 'estate_model_rf' already exists. Creating a new version of this model...\n", + "2024/10/03 19:03:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. 
Model name: estate_model_rf, version 2\n", + "Created version '2' of model 'estate_model_rf'.\n", + "2024/10/03 19:03:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run register_at_run at: http://127.0.0.1:5000/#/experiments/1/runs/ed64a91759ed43c99329810d066ea95a.\n", + "2024/10/03 19:03:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.\n" + ] + } + ], + "source": [ + "# Можно регистрировать сразу при создании прогона\n", + "\n", + "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n", + "\n", + "with mlflow.start_run(run_name='register_at_run', experiment_id=experiment_id) as run:\n", + " # получаем уникальный идентификатор запуска эксперимента\n", + " run_id = run.info.run_id \n", + " mlflow.sklearn.log_model(pipeline, \n", + " artifact_path=\"models\",\n", + " signature=signature,\n", + " input_example=input_example,\n", + " pip_requirements=req_file,\n", + " registered_model_name = REGISTRY_MODEL_NAME # Указываем для какой модели регистрируем\n", + " )\n", + " mlflow.log_metrics(metrics)\n", + " mlflow.log_artifact(art)\n", + " mlflow.log_params(pipeline.get_params())\n", + "\n", + "run = mlflow.get_run(run_id) \n", + "assert (run.info.status =='FINISHED')" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "], name='estate_model_rf', tags={}>" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Можно найти зарегистрированные модели\n", + "model_reg = mlflow.search_registered_models()\n", + "model_reg[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "model_name = REGISTRY_MODEL_NAME\n", + "model_version = 1\n", + "\n", + "model_loaded = mlflow.sklearn.load_model(model_uri=f\"models:/{model_name}/{model_version}\")\n" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([3438055.97819847])" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model_loaded.predict(X_test.iloc[0:1])" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "np.int64(3062900)" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_test.iloc[0]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv_labs_proj", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lectures/lec4-mlflow.odp b/lectures/lec4-mlflow.odp index 07b329c..2c90b78 100644 Binary files a/lectures/lec4-mlflow.odp and b/lectures/lec4-mlflow.odp differ diff --git a/lectures/lec4-mlflow.pptx b/lectures/lec4-mlflow.pptx index 09dcdfc..c2a7242 100644 Binary files a/lectures/lec4-mlflow.pptx and b/lectures/lec4-mlflow.pptx differ