Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

2011 строки
89 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import mlflow\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"import pandas as pd\n",
"import numpy\n",
"\n",
"from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder\n",
"\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.linear_model import LinearRegression\n",
"\n",
"from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Index: 539355 entries, 1979096 to 5189500\n",
"Data columns (total 12 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 price 539355 non-null int64 \n",
" 1 geo_lat 539355 non-null float32 \n",
" 2 geo_lon 539355 non-null float32 \n",
" 3 region 539355 non-null category\n",
" 4 building_type 539355 non-null category\n",
" 5 level 539355 non-null int8 \n",
" 6 levels 539355 non-null int8 \n",
" 7 rooms 539355 non-null int8 \n",
" 8 area 539355 non-null float16 \n",
" 9 kitchen_area 539355 non-null float16 \n",
" 10 object_type 539355 non-null category\n",
" 11 floor_level 539355 non-null object \n",
"dtypes: category(3), float16(2), float32(2), int64(1), int8(3), object(1)\n",
"memory usage: 21.6+ MB\n"
]
}
],
"source": [
"df = pd.read_pickle('data/clean_data.pkl').sample(frac=0.1, random_state = 2) # Уменьшаем размер выборки, чтобы модель быстрее обучалась на лекции\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df = df.rename(columns={'price': 'target'})"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/pandas/io/formats/format.py:1458: RuntimeWarning: overflow encountered in cast\n",
" has_large_values = (abs_vals > 1e6).any()\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>target</th>\n",
" <th>geo_lat</th>\n",
" <th>geo_lon</th>\n",
" <th>region</th>\n",
" <th>building_type</th>\n",
" <th>level</th>\n",
" <th>levels</th>\n",
" <th>rooms</th>\n",
" <th>area</th>\n",
" <th>kitchen_area</th>\n",
" <th>object_type</th>\n",
" <th>floor_level</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1979096</th>\n",
" <td>1300000</td>\n",
" <td>52.821098</td>\n",
" <td>83.113037</td>\n",
" <td>6817</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>66.50000</td>\n",
" <td>10.000000</td>\n",
" <td>1</td>\n",
" <td>first</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1833303</th>\n",
" <td>8800000</td>\n",
" <td>55.707539</td>\n",
" <td>37.467068</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>15</td>\n",
" <td>16</td>\n",
" <td>2</td>\n",
" <td>46.00000</td>\n",
" <td>7.000000</td>\n",
" <td>1</td>\n",
" <td>hi</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1494335</th>\n",
" <td>1958000</td>\n",
" <td>54.988400</td>\n",
" <td>82.783691</td>\n",
" <td>9654</td>\n",
" <td>2</td>\n",
" <td>13</td>\n",
" <td>17</td>\n",
" <td>1</td>\n",
" <td>36.50000</td>\n",
" <td>11.960938</td>\n",
" <td>11</td>\n",
" <td>hi</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2747476</th>\n",
" <td>1461600</td>\n",
" <td>53.298553</td>\n",
" <td>50.326382</td>\n",
" <td>3106</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>32.59375</td>\n",
" <td>9.601562</td>\n",
" <td>11</td>\n",
" <td>last</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5027275</th>\n",
" <td>3000000</td>\n",
" <td>42.897934</td>\n",
" <td>47.624825</td>\n",
" <td>4007</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>10</td>\n",
" <td>2</td>\n",
" <td>70.00000</td>\n",
" <td>12.000000</td>\n",
" <td>11</td>\n",
" <td>mid</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2476626</th>\n",
" <td>1490000</td>\n",
" <td>54.943806</td>\n",
" <td>82.957870</td>\n",
" <td>9654</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>10</td>\n",
" <td>1</td>\n",
" <td>48.06250</td>\n",
" <td>14.000000</td>\n",
" <td>11</td>\n",
" <td>low</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1487454</th>\n",
" <td>19000000</td>\n",
" <td>55.772240</td>\n",
" <td>37.731136</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" <td>100.00000</td>\n",
" <td>13.000000</td>\n",
" <td>1</td>\n",
" <td>mid</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2772844</th>\n",
" <td>1200000</td>\n",
" <td>54.474590</td>\n",
" <td>53.531807</td>\n",
" <td>2722</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>9</td>\n",
" <td>1</td>\n",
" <td>32.09375</td>\n",
" <td>7.000000</td>\n",
" <td>1</td>\n",
" <td>mid</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3982304</th>\n",
" <td>2300000</td>\n",
" <td>55.378265</td>\n",
" <td>39.053310</td>\n",
" <td>81</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>49.00000</td>\n",
" <td>9.000000</td>\n",
" <td>1</td>\n",
" <td>first</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5189500</th>\n",
" <td>9157730</td>\n",
" <td>55.542957</td>\n",
" <td>37.479919</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>8</td>\n",
" <td>17</td>\n",
" <td>2</td>\n",
" <td>52.31250</td>\n",
" <td>17.593750</td>\n",
" <td>11</td>\n",
" <td>mid</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>539355 rows × 12 columns</p>\n",
"</div>"
],
"text/plain": [
" target geo_lat geo_lon region building_type level levels \\\n",
"1979096 1300000 52.821098 83.113037 6817 1 1 1 \n",
"1833303 8800000 55.707539 37.467068 3 1 15 16 \n",
"1494335 1958000 54.988400 82.783691 9654 2 13 17 \n",
"2747476 1461600 53.298553 50.326382 3106 3 5 5 \n",
"5027275 3000000 42.897934 47.624825 4007 3 4 10 \n",
"... ... ... ... ... ... ... ... \n",
"2476626 1490000 54.943806 82.957870 9654 1 2 10 \n",
"1487454 19000000 55.772240 37.731136 3 3 4 12 \n",
"2772844 1200000 54.474590 53.531807 2722 1 5 9 \n",
"3982304 2300000 55.378265 39.053310 81 1 1 5 \n",
"5189500 9157730 55.542957 37.479919 3 1 8 17 \n",
"\n",
" rooms area kitchen_area object_type floor_level \n",
"1979096 3 66.50000 10.000000 1 first \n",
"1833303 2 46.00000 7.000000 1 hi \n",
"1494335 1 36.50000 11.960938 11 hi \n",
"2747476 1 32.59375 9.601562 11 last \n",
"5027275 2 70.00000 12.000000 11 mid \n",
"... ... ... ... ... ... \n",
"2476626 1 48.06250 14.000000 11 low \n",
"1487454 3 100.00000 13.000000 1 mid \n",
"2772844 1 32.09375 7.000000 1 mid \n",
"3982304 2 49.00000 9.000000 1 first \n",
"5189500 2 52.31250 17.593750 11 mid \n",
"\n",
"[539355 rows x 12 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.25, random_state=2)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['region', 'building_type', 'object_type', 'floor_level']"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cat_features = X_train.select_dtypes(include=['category','object']).columns.to_list()\n",
"cat_features"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['geo_lat', 'geo_lon', 'level', 'levels', 'rooms', 'area', 'kitchen_area']"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"num_features = X_train.select_dtypes(include=['number']).columns.to_list()\n",
"num_features"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://scikit-learn.org/stable/api/sklearn.preprocessing.html - разные способы кодирования и масштабирования признаков"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"s_scaler = StandardScaler()\n",
"l_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=999) # unknown_value не должен совпадать с кодами известных категорий (0..n_categories-1)\n",
"regressor = RandomForestRegressor(n_estimators=20, max_depth=10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Column transformer"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Для удобной работы со столбцами\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" ('num', s_scaler, num_features), # преобразования для числовых признаков\n",
" ('cat', l_encoder, cat_features), # преобразования для категориальных признаков\n",
" ],\n",
" remainder='drop' ) # Удаляем столбцы, которые не затронуты преобразованиями"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-1 {\n",
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
"#sk-container-id-1 {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"#sk-container-id-1 pre {\n",
" padding: 0;\n",
"}\n",
"\n",
"#sk-container-id-1 input.sk-hidden--visually {\n",
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-dashed-wrapped {\n",
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-container {\n",
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-text-repr-fallback {\n",
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
"#sk-container-id-1 div.sk-parallel-item::after {\n",
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-parallel {\n",
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-parallel-item {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-parallel-item:first-child::after {\n",
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-parallel-item:last-child::after {\n",
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-parallel-item:only-child::after {\n",
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
"#sk-container-id-1 div.sk-serial {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
"#sk-container-id-1 div.sk-toggleable {\n",
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
"#sk-container-id-1 label.sk-toggleable__label {\n",
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
"#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n",
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
"#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
"#sk-container-id-1 div.sk-toggleable__content {\n",
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-toggleable__content.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-toggleable__content pre {\n",
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
"#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
"#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
"#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-1 div.sk-label label {\n",
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
"#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
"#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
"#sk-container-id-1 div.sk-label label {\n",
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-label-container {\n",
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
"#sk-container-id-1 div.sk-estimator {\n",
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-estimator.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
"#sk-container-id-1 div.sk-estimator:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-estimator.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
"#sk-container-id-1 a.estimator_doc_link {\n",
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
"#sk-container-id-1 a.estimator_doc_link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"#sk-container-id-1 a.estimator_doc_link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;num&#x27;, StandardScaler(),\n",
" [&#x27;geo_lat&#x27;, &#x27;geo_lon&#x27;,\n",
" &#x27;level&#x27;, &#x27;levels&#x27;, &#x27;rooms&#x27;,\n",
" &#x27;area&#x27;, &#x27;kitchen_area&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" OrdinalEncoder(handle_unknown=&#x27;use_encoded_value&#x27;,\n",
" unknown_value=999),\n",
" [&#x27;region&#x27;, &#x27;building_type&#x27;,\n",
" &#x27;object_type&#x27;,\n",
" &#x27;floor_level&#x27;])])),\n",
" (&#x27;model&#x27;,\n",
" RandomForestRegressor(max_depth=10, n_estimators=20))])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" ><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;Pipeline<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.pipeline.Pipeline.html\">?<span>Documentation for Pipeline</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;num&#x27;, StandardScaler(),\n",
" [&#x27;geo_lat&#x27;, &#x27;geo_lon&#x27;,\n",
" &#x27;level&#x27;, &#x27;levels&#x27;, &#x27;rooms&#x27;,\n",
" &#x27;area&#x27;, &#x27;kitchen_area&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" OrdinalEncoder(handle_unknown=&#x27;use_encoded_value&#x27;,\n",
" unknown_value=999),\n",
" [&#x27;region&#x27;, &#x27;building_type&#x27;,\n",
" &#x27;object_type&#x27;,\n",
" &#x27;floor_level&#x27;])])),\n",
" (&#x27;model&#x27;,\n",
" RandomForestRegressor(max_depth=10, n_estimators=20))])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-2\" type=\"checkbox\" ><label for=\"sk-estimator-id-2\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;preprocessor: ColumnTransformer<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.compose.ColumnTransformer.html\">?<span>Documentation for preprocessor: ColumnTransformer</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>ColumnTransformer(transformers=[(&#x27;num&#x27;, StandardScaler(),\n",
" [&#x27;geo_lat&#x27;, &#x27;geo_lon&#x27;, &#x27;level&#x27;, &#x27;levels&#x27;,\n",
" &#x27;rooms&#x27;, &#x27;area&#x27;, &#x27;kitchen_area&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" OrdinalEncoder(handle_unknown=&#x27;use_encoded_value&#x27;,\n",
" unknown_value=999),\n",
" [&#x27;region&#x27;, &#x27;building_type&#x27;, &#x27;object_type&#x27;,\n",
" &#x27;floor_level&#x27;])])</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-3\" type=\"checkbox\" ><label for=\"sk-estimator-id-3\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">num</label><div class=\"sk-toggleable__content fitted\"><pre>[&#x27;geo_lat&#x27;, &#x27;geo_lon&#x27;, &#x27;level&#x27;, &#x27;levels&#x27;, &#x27;rooms&#x27;, &#x27;area&#x27;, &#x27;kitchen_area&#x27;]</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-4\" type=\"checkbox\" ><label for=\"sk-estimator-id-4\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;StandardScaler<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.preprocessing.StandardScaler.html\">?<span>Documentation for StandardScaler</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>StandardScaler()</pre></div> </div></div></div></div></div><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-5\" type=\"checkbox\" ><label for=\"sk-estimator-id-5\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">cat</label><div class=\"sk-toggleable__content fitted\"><pre>[&#x27;region&#x27;, &#x27;building_type&#x27;, &#x27;object_type&#x27;, &#x27;floor_level&#x27;]</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control 
sk-hidden--visually\" id=\"sk-estimator-id-6\" type=\"checkbox\" ><label for=\"sk-estimator-id-6\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;OrdinalEncoder<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.preprocessing.OrdinalEncoder.html\">?<span>Documentation for OrdinalEncoder</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>OrdinalEncoder(handle_unknown=&#x27;use_encoded_value&#x27;, unknown_value=999)</pre></div> </div></div></div></div></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-7\" type=\"checkbox\" ><label for=\"sk-estimator-id-7\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;RandomForestRegressor<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.ensemble.RandomForestRegressor.html\">?<span>Documentation for RandomForestRegressor</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>RandomForestRegressor(max_depth=10, n_estimators=20)</pre></div> </div></div></div></div></div></div>"
],
"text/plain": [
"Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num', StandardScaler(),\n",
" ['geo_lat', 'geo_lon',\n",
" 'level', 'levels', 'rooms',\n",
" 'area', 'kitchen_area']),\n",
" ('cat',\n",
" OrdinalEncoder(handle_unknown='use_encoded_value',\n",
" unknown_value=999),\n",
" ['region', 'building_type',\n",
" 'object_type',\n",
" 'floor_level'])])),\n",
" ('model',\n",
" RandomForestRegressor(max_depth=10, n_estimators=20))])"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"pipeline = Pipeline(steps=[('preprocessor', preprocessor), \n",
" ('model', regressor)])\n",
"\n",
"pipeline.fit(X_train, y_train)\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'mae': np.float64(1276343.108894747),\n",
" 'mape': np.float64(0.35471390164231303),\n",
" 'mse': np.float64(174567675833231.12)}"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predictions = pipeline.predict(X_test) \n",
"\n",
"metrics = {}\n",
"metrics[\"mae\"] = mean_absolute_error(y_test, predictions) \n",
"metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions)\n",
"metrics[\"mse\"] = mean_squared_error(y_test, predictions)\n",
"\n",
"metrics"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Работаем с MLflow локально\n",
"TRACKING_SERVER_HOST = \"127.0.0.1\"\n",
"TRACKING_SERVER_PORT = 5000\n",
"\n",
"registry_uri = f\"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}\"\n",
"tracking_uri = f\"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}\"\n",
"\n",
"mlflow.set_tracking_uri(tracking_uri) \n",
"mlflow.set_registry_uri(registry_uri) \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Названия тестового эксперимента и запуска (run) внутри него, а также имя, под которым модель будет регистрироваться\n",
"EXPERIMENT_NAME = \"estate_project\"\n",
"RUN_NAME = \"baseline model\"\n",
"REGISTRY_MODEL_NAME = \"estate_model_rf\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Логируем вручную"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\n",
" warnings.warn(\n"
]
}
],
"source": [
"# Обязательно логируем сигнатуру модели и пример входных данных. Подготовим их\n",
"from mlflow.models import infer_signature\n",
"\n",
"signature = infer_signature(model_input = X_train.head(5))\n",
"input_example = X_train.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Будем логировать requirements и артефакт - текстовый файл\n",
"req_file = 'requirements.txt'\n",
"art = 'comment.txt'"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Параметры, которые будут залогированы, можем задавать вручную или полностью взять из модели\n",
"#params_dict = {'n_estimators': 10, 'max_depth': 10}\n",
"params_dict = pipeline.get_params()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024/10/03 18:59:13 INFO mlflow.tracking._tracking_service.client: 🏃 View run baseline model at: http://127.0.0.1:5000/#/experiments/1/runs/24e41bb582554f42953fe6dc2b6b190e.\n",
"2024/10/03 18:59:13 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.\n"
]
}
],
"source": [
"# Когда создаем новый эксперимент, то: \n",
"experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)\n",
"\n",
"# Впоследствии, чтобы добавлять запуски в этот же эксперимент, мы должны получить его id:\n",
"#experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
"\n",
"with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
" # получаем уникальный идентификатор запуска эксперимента\n",
" run_id = run.info.run_id \n",
" mlflow.sklearn.log_model(pipeline, \n",
" artifact_path=\"models\",\n",
" signature=signature,\n",
" input_example=input_example,\n",
" pip_requirements=req_file\n",
" )\n",
" mlflow.log_metrics(metrics)\n",
" mlflow.log_artifact(art)\n",
" mlflow.log_params(params_dict)\n",
"\n",
"run = mlflow.get_run(run_id) \n",
"assert (run.info.status =='FINISHED')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Удаление runs, experiments\n",
"\n",
"Использовать осторожно"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
"#mlflow.delete_experiment(experiment_id)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>run_id</th>\n",
" <th>experiment_id</th>\n",
" <th>status</th>\n",
" <th>artifact_uri</th>\n",
" <th>start_time</th>\n",
" <th>end_time</th>\n",
" <th>metrics.mae</th>\n",
" <th>metrics.mape</th>\n",
" <th>metrics.mse</th>\n",
" <th>params.preprocessor__cat__handle_unknown</th>\n",
" <th>...</th>\n",
" <th>params.model__max_samples</th>\n",
" <th>params.preprocessor__transformers</th>\n",
" <th>params.model__monotonic_cst</th>\n",
" <th>params.model__warm_start</th>\n",
" <th>params.preprocessor__remainder</th>\n",
" <th>tags.mlflow.user</th>\n",
" <th>tags.mlflow.source.type</th>\n",
" <th>tags.mlflow.runName</th>\n",
" <th>tags.mlflow.source.name</th>\n",
" <th>tags.mlflow.log-model.history</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>24e41bb582554f42953fe6dc2b6b190e</td>\n",
" <td>1</td>\n",
" <td>FINISHED</td>\n",
" <td>mlflow-artifacts:/1/24e41bb582554f42953fe6dc2b...</td>\n",
" <td>2024-10-03 15:59:12.732000+00:00</td>\n",
" <td>2024-10-03 15:59:13.921000+00:00</td>\n",
" <td>1.276343e+06</td>\n",
" <td>0.354714</td>\n",
" <td>1.745677e+14</td>\n",
" <td>use_encoded_value</td>\n",
" <td>...</td>\n",
" <td>None</td>\n",
" <td>[('num', StandardScaler(), ['geo_lat', 'geo_lo...</td>\n",
" <td>None</td>\n",
" <td>False</td>\n",
" <td>drop</td>\n",
" <td>andrey</td>\n",
" <td>LOCAL</td>\n",
" <td>baseline model</td>\n",
" <td>/home/andrey/work/institute/MLE/assets/mlflow/...</td>\n",
" <td>[{\"run_id\": \"24e41bb582554f42953fe6dc2b6b190e\"...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1 rows × 57 columns</p>\n",
"</div>"
],
"text/plain": [
" run_id experiment_id status \\\n",
"0 24e41bb582554f42953fe6dc2b6b190e 1 FINISHED \n",
"\n",
" artifact_uri \\\n",
"0 mlflow-artifacts:/1/24e41bb582554f42953fe6dc2b... \n",
"\n",
" start_time end_time \\\n",
"0 2024-10-03 15:59:12.732000+00:00 2024-10-03 15:59:13.921000+00:00 \n",
"\n",
" metrics.mae metrics.mape metrics.mse \\\n",
"0 1.276343e+06 0.354714 1.745677e+14 \n",
"\n",
" params.preprocessor__cat__handle_unknown ... params.model__max_samples \\\n",
"0 use_encoded_value ... None \n",
"\n",
" params.preprocessor__transformers \\\n",
"0 [('num', StandardScaler(), ['geo_lat', 'geo_lo... \n",
"\n",
" params.model__monotonic_cst params.model__warm_start \\\n",
"0 None False \n",
"\n",
" params.preprocessor__remainder tags.mlflow.user tags.mlflow.source.type \\\n",
"0 drop andrey LOCAL \n",
"\n",
" tags.mlflow.runName tags.mlflow.source.name \\\n",
"0 baseline model /home/andrey/work/institute/MLE/assets/mlflow/... \n",
"\n",
" tags.mlflow.log-model.history \n",
"0 [{\"run_id\": \"24e41bb582554f42953fe6dc2b6b190e\"... \n",
"\n",
"[1 rows x 57 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mlflow.search_runs(\n",
" #experiment_ids=[experiment_id],\n",
" experiment_names=[EXPERIMENT_NAME],\n",
" # filter_string='status = \"FAILED\"'\n",
" #filter_string='metrics.mae > 1'\n",
" \n",
")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"\n",
"#mlflow.delete_run('74d2a7a40c07413c9cf65df841164356')"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Автологирование\n",
"После включения автологирование будет срабатывать при каждом обучении модели (при вызове метода fit()).\n",
"\n",
"Есть плюсы, есть и минусы. Предлагается сделать прогон и сравнить его с результатами ручного логирования."
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024/10/03 18:59:14 WARNING mlflow.utils.autologging_utils: MLflow sklearn autologging is known to be compatible with 0.24.1 <= scikit-learn <= 1.5.1, but the installed version is 1.5.2. If you encounter errors during autologging, try upgrading / downgrading scikit-learn to a compatible version, or try upgrading MLflow.\n",
"2024/10/03 19:02:16 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\"\n",
"2024/10/03 19:02:40 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: \"/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.\"\n",
"2024/10/03 19:02:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run auto at: http://127.0.0.1:5000/#/experiments/1/runs/2ced09116c264623b89d8df7fe33cb10.\n",
"2024/10/03 19:02:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.\n"
]
}
],
"source": [
"mlflow.sklearn.autolog()\n",
"\n",
"with mlflow.start_run(run_name='auto', experiment_id=experiment_id) as run:\n",
" pipeline.fit(X_train, y_train)\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Отключаем автологирование\n",
"mlflow.sklearn.autolog(disable=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Model #2\n",
"Обучим вторую \"маленькую\" модель\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"regressor2 = RandomForestRegressor(n_estimators=10, max_depth=6)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style>#sk-container-id-3 {\n",
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
"#sk-container-id-3 {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"#sk-container-id-3 pre {\n",
" padding: 0;\n",
"}\n",
"\n",
"#sk-container-id-3 input.sk-hidden--visually {\n",
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-dashed-wrapped {\n",
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-container {\n",
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-text-repr-fallback {\n",
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
"#sk-container-id-3 div.sk-parallel-item::after {\n",
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-parallel {\n",
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-parallel-item {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-parallel-item:first-child::after {\n",
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-parallel-item:last-child::after {\n",
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-parallel-item:only-child::after {\n",
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
"#sk-container-id-3 div.sk-serial {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
"#sk-container-id-3 div.sk-toggleable {\n",
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
"#sk-container-id-3 label.sk-toggleable__label {\n",
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
"#sk-container-id-3 label.sk-toggleable__label-arrow:before {\n",
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
"#sk-container-id-3 label.sk-toggleable__label-arrow:hover:before {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
"#sk-container-id-3 div.sk-toggleable__content {\n",
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-toggleable__content.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-toggleable__content pre {\n",
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-toggleable__content.fitted pre {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-3 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
"#sk-container-id-3 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
"#sk-container-id-3 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
"#sk-container-id-3 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-3 div.sk-label label {\n",
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
"#sk-container-id-3 div.sk-label:hover label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
"#sk-container-id-3 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
"#sk-container-id-3 div.sk-label label {\n",
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-label-container {\n",
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
"#sk-container-id-3 div.sk-estimator {\n",
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-estimator.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
"#sk-container-id-3 div.sk-estimator:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-3 div.sk-estimator.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
"#sk-container-id-3 a.estimator_doc_link {\n",
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
"#sk-container-id-3 a.estimator_doc_link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"#sk-container-id-3 a.estimator_doc_link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"#sk-container-id-3 a.estimator_doc_link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"</style><div id=\"sk-container-id-3\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;num&#x27;, StandardScaler(),\n",
" [&#x27;geo_lat&#x27;, &#x27;geo_lon&#x27;,\n",
" &#x27;level&#x27;, &#x27;levels&#x27;, &#x27;rooms&#x27;,\n",
" &#x27;area&#x27;, &#x27;kitchen_area&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" OrdinalEncoder(handle_unknown=&#x27;use_encoded_value&#x27;,\n",
" unknown_value=999),\n",
" [&#x27;region&#x27;, &#x27;building_type&#x27;,\n",
" &#x27;object_type&#x27;,\n",
" &#x27;floor_level&#x27;])])),\n",
" (&#x27;model&#x27;, RandomForestRegressor(max_depth=6, n_estimators=10))])</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-15\" type=\"checkbox\" ><label for=\"sk-estimator-id-15\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;&nbsp;Pipeline<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.pipeline.Pipeline.html\">?<span>Documentation for Pipeline</span></a><span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>Pipeline(steps=[(&#x27;preprocessor&#x27;,\n",
" ColumnTransformer(transformers=[(&#x27;num&#x27;, StandardScaler(),\n",
" [&#x27;geo_lat&#x27;, &#x27;geo_lon&#x27;,\n",
" &#x27;level&#x27;, &#x27;levels&#x27;, &#x27;rooms&#x27;,\n",
" &#x27;area&#x27;, &#x27;kitchen_area&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" OrdinalEncoder(handle_unknown=&#x27;use_encoded_value&#x27;,\n",
" unknown_value=999),\n",
" [&#x27;region&#x27;, &#x27;building_type&#x27;,\n",
" &#x27;object_type&#x27;,\n",
" &#x27;floor_level&#x27;])])),\n",
" (&#x27;model&#x27;, RandomForestRegressor(max_depth=6, n_estimators=10))])</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item sk-dashed-wrapped\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-16\" type=\"checkbox\" ><label for=\"sk-estimator-id-16\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;preprocessor: ColumnTransformer<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.compose.ColumnTransformer.html\">?<span>Documentation for preprocessor: ColumnTransformer</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>ColumnTransformer(transformers=[(&#x27;num&#x27;, StandardScaler(),\n",
" [&#x27;geo_lat&#x27;, &#x27;geo_lon&#x27;, &#x27;level&#x27;, &#x27;levels&#x27;,\n",
" &#x27;rooms&#x27;, &#x27;area&#x27;, &#x27;kitchen_area&#x27;]),\n",
" (&#x27;cat&#x27;,\n",
" OrdinalEncoder(handle_unknown=&#x27;use_encoded_value&#x27;,\n",
" unknown_value=999),\n",
" [&#x27;region&#x27;, &#x27;building_type&#x27;, &#x27;object_type&#x27;,\n",
" &#x27;floor_level&#x27;])])</pre></div> </div></div><div class=\"sk-parallel\"><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-17\" type=\"checkbox\" ><label for=\"sk-estimator-id-17\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">num</label><div class=\"sk-toggleable__content fitted\"><pre>[&#x27;geo_lat&#x27;, &#x27;geo_lon&#x27;, &#x27;level&#x27;, &#x27;levels&#x27;, &#x27;rooms&#x27;, &#x27;area&#x27;, &#x27;kitchen_area&#x27;]</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-18\" type=\"checkbox\" ><label for=\"sk-estimator-id-18\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;StandardScaler<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.preprocessing.StandardScaler.html\">?<span>Documentation for StandardScaler</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>StandardScaler()</pre></div> </div></div></div></div></div><div class=\"sk-parallel-item\"><div class=\"sk-item\"><div class=\"sk-label-container\"><div class=\"sk-label fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-19\" type=\"checkbox\" ><label for=\"sk-estimator-id-19\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">cat</label><div class=\"sk-toggleable__content fitted\"><pre>[&#x27;region&#x27;, &#x27;building_type&#x27;, &#x27;object_type&#x27;, &#x27;floor_level&#x27;]</pre></div> </div></div><div class=\"sk-serial\"><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control 
sk-hidden--visually\" id=\"sk-estimator-id-20\" type=\"checkbox\" ><label for=\"sk-estimator-id-20\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;OrdinalEncoder<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.preprocessing.OrdinalEncoder.html\">?<span>Documentation for OrdinalEncoder</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>OrdinalEncoder(handle_unknown=&#x27;use_encoded_value&#x27;, unknown_value=999)</pre></div> </div></div></div></div></div></div></div><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-21\" type=\"checkbox\" ><label for=\"sk-estimator-id-21\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;RandomForestRegressor<a class=\"sk-estimator-doc-link fitted\" rel=\"noreferrer\" target=\"_blank\" href=\"https://scikit-learn.org/1.5/modules/generated/sklearn.ensemble.RandomForestRegressor.html\">?<span>Documentation for RandomForestRegressor</span></a></label><div class=\"sk-toggleable__content fitted\"><pre>RandomForestRegressor(max_depth=6, n_estimators=10)</pre></div> </div></div></div></div></div></div>"
],
"text/plain": [
"Pipeline(steps=[('preprocessor',\n",
" ColumnTransformer(transformers=[('num', StandardScaler(),\n",
" ['geo_lat', 'geo_lon',\n",
" 'level', 'levels', 'rooms',\n",
" 'area', 'kitchen_area']),\n",
" ('cat',\n",
" OrdinalEncoder(handle_unknown='use_encoded_value',\n",
" unknown_value=999),\n",
" ['region', 'building_type',\n",
" 'object_type',\n",
" 'floor_level'])])),\n",
" ('model', RandomForestRegressor(max_depth=6, n_estimators=10))])"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pipeline = Pipeline(steps=[('preprocessor', preprocessor), \n",
" ('model', regressor2)])\n",
"\n",
"pipeline.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'mae': np.float64(1536543.887713661),\n",
" 'mape': np.float64(0.42528854535519156),\n",
" 'mse': np.float64(210549541556055.7)}"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predictions = pipeline.predict(X_test) \n",
"metrics = {}\n",
"metrics[\"mae\"] = mean_absolute_error(y_test, predictions) \n",
"metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions)\n",
"metrics[\"mse\"] = mean_squared_error(y_test, predictions)\n",
"\n",
"metrics"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024/10/03 19:02:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run smaller_model at: http://127.0.0.1:5000/#/experiments/1/runs/20f66bd4c3754a04b5e47ecc0f577e76.\n",
"2024/10/03 19:02:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.\n"
]
}
],
"source": [
"# !!! Проверить название прогона, а также все логируемые параметры и артефакты: они должны соответствовать второй \"маленькой\" модели. \n",
"\n",
"\n",
"RUN_NAME = 'smaller_model'\n",
"# Когда создаем новый эксперимент, то: \n",
"#experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)\n",
"\n",
"# Впоследствии, чтобы добавлять запуски в этот же эксперимент, мы должны получить его id:\n",
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
"\n",
"with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
" # получаем уникальный идентификатор запуска эксперимента\n",
" run_id = run.info.run_id \n",
" mlflow.sklearn.log_model(pipeline, \n",
" artifact_path=\"models\",\n",
" signature=signature,\n",
" input_example=input_example,\n",
" pip_requirements=req_file\n",
" )\n",
" mlflow.log_metrics(metrics)\n",
" mlflow.log_artifact(art)\n",
" mlflow.log_params(pipeline.get_params())\n",
"\n",
"run = mlflow.get_run(run_id) \n",
"assert (run.info.status =='FINISHED')"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024/10/03 19:02:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run no_model at: http://127.0.0.1:5000/#/experiments/1/runs/6f6fe970eb74485d866e918b733f8f61.\n",
"2024/10/03 19:02:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.\n"
]
}
],
"source": [
"# No model\n",
"# Логировать можно и одни только артефакты, без модели. Например, залогировать графики после этапа EDA\n",
"\n",
"RUN_NAME = 'no_model'\n",
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
"\n",
"with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
" run_id = run.info.run_id \n",
" mlflow.log_artifact(art)\n",
"\n",
"\n",
"run = mlflow.get_run(run_id) \n",
"assert (run.info.status =='FINISHED')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Registered model 'estate_model_rf' already exists. Creating a new version of this model...\n",
"2024/10/03 19:03:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: estate_model_rf, version 1\n",
"Created version '1' of model 'estate_model_rf'.\n"
]
},
{
"data": {
"text/plain": [
"<ModelVersion: aliases=[], creation_timestamp=1727971394174, current_stage='None', description='', last_updated_timestamp=1727971394174, name='estate_model_rf', run_id='24e41bb582554f42953fe6dc2b6b190e', run_link='', source='mlflow-artifacts:/1/24e41bb582554f42953fe6dc2b6b190e/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='1'>"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"run_id = '' # Указываем run id\n",
"mlflow.register_model(f\"runs:/{run_id}/models\", REGISTRY_MODEL_NAME)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Registered model 'estate_model_rf' already exists. Creating a new version of this model...\n",
"2024/10/03 19:03:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: estate_model_rf, version 2\n",
"Created version '2' of model 'estate_model_rf'.\n",
"2024/10/03 19:03:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run register_at_run at: http://127.0.0.1:5000/#/experiments/1/runs/ed64a91759ed43c99329810d066ea95a.\n",
"2024/10/03 19:03:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.\n"
]
}
],
"source": [
"# Можно регистрировать сразу при создании прогона\n",
"\n",
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
"\n",
"with mlflow.start_run(run_name='register_at_run', experiment_id=experiment_id) as run:\n",
" # получаем уникальный идентификатор запуска эксперимента\n",
" run_id = run.info.run_id \n",
" mlflow.sklearn.log_model(pipeline, \n",
" artifact_path=\"models\",\n",
" signature=signature,\n",
" input_example=input_example,\n",
" pip_requirements=req_file,\n",
" registered_model_name = REGISTRY_MODEL_NAME # Указываем для какой модели регистрируем\n",
" )\n",
" mlflow.log_metrics(metrics)\n",
" mlflow.log_artifact(art)\n",
" mlflow.log_params(pipeline.get_params())\n",
"\n",
"run = mlflow.get_run(run_id) \n",
"assert (run.info.status =='FINISHED')"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<RegisteredModel: aliases={}, creation_timestamp=1727971371173, description='', last_updated_timestamp=1727971394354, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1727971394354, current_stage='None', description='', last_updated_timestamp=1727971394354, name='estate_model_rf', run_id='ed64a91759ed43c99329810d066ea95a', run_link='', source='mlflow-artifacts:/1/ed64a91759ed43c99329810d066ea95a/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='2'>], name='estate_model_rf', tags={}>"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Можно найти зарегистрированные модели\n",
"model_reg = mlflow.search_registered_models()\n",
"model_reg[0]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"\n",
"model_name = REGISTRY_MODEL_NAME\n",
"model_version = 1\n",
"\n",
"model_loaded = mlflow.sklearn.load_model(model_uri=f\"models:/{model_name}/{model_version}\")\n"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([3438055.97819847])"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model_loaded.predict(X_test.iloc[0:1])"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"np.int64(3062900)"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y_test.iloc[0]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv_labs_proj",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}