Вы не можете выбрать более 25 тем
Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.
89 KiB
89 KiB
import os
import mlflow
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error
# Load the cleaned dataset and keep a reproducible 10% sample so that the
# model trains faster during the lecture.
df = pd.read_pickle('data/clean_data.pkl').sample(frac=0.1, random_state=2)
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 539355 entries, 1979096 to 5189500
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 price 539355 non-null int64
1 geo_lat 539355 non-null float32
2 geo_lon 539355 non-null float32
3 region 539355 non-null category
4 building_type 539355 non-null category
5 level 539355 non-null int8
6 levels 539355 non-null int8
7 rooms 539355 non-null int8
8 area 539355 non-null float16
9 kitchen_area 539355 non-null float16
10 object_type 539355 non-null category
11 floor_level 539355 non-null object
dtypes: category(3), float16(2), float32(2), int64(1), int8(3), object(1)
memory usage: 21.6+ MB
# Rename the price column to 'target'; the train/test split below selects
# the label by that name.
df = df.rename(columns={'price': 'target'})
df
/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/pandas/io/formats/format.py:1458: RuntimeWarning: overflow encountered in cast
has_large_values = (abs_vals > 1e6).any()
target | geo_lat | geo_lon | region | building_type | level | levels | rooms | area | kitchen_area | object_type | floor_level | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1979096 | 1300000 | 52.821098 | 83.113037 | 6817 | 1 | 1 | 1 | 3 | 66.50000 | 10.000000 | 1 | first |
1833303 | 8800000 | 55.707539 | 37.467068 | 3 | 1 | 15 | 16 | 2 | 46.00000 | 7.000000 | 1 | hi |
1494335 | 1958000 | 54.988400 | 82.783691 | 9654 | 2 | 13 | 17 | 1 | 36.50000 | 11.960938 | 11 | hi |
2747476 | 1461600 | 53.298553 | 50.326382 | 3106 | 3 | 5 | 5 | 1 | 32.59375 | 9.601562 | 11 | last |
5027275 | 3000000 | 42.897934 | 47.624825 | 4007 | 3 | 4 | 10 | 2 | 70.00000 | 12.000000 | 11 | mid |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2476626 | 1490000 | 54.943806 | 82.957870 | 9654 | 1 | 2 | 10 | 1 | 48.06250 | 14.000000 | 11 | low |
1487454 | 19000000 | 55.772240 | 37.731136 | 3 | 3 | 4 | 12 | 3 | 100.00000 | 13.000000 | 1 | mid |
2772844 | 1200000 | 54.474590 | 53.531807 | 2722 | 1 | 5 | 9 | 1 | 32.09375 | 7.000000 | 1 | mid |
3982304 | 2300000 | 55.378265 | 39.053310 | 81 | 1 | 1 | 5 | 2 | 49.00000 | 9.000000 | 1 | first |
5189500 | 9157730 | 55.542957 | 37.479919 | 3 | 1 | 8 | 17 | 2 | 52.31250 | 17.593750 | 11 | mid |
539355 rows × 12 columns
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('target', axis=1),
    df['target'],
    test_size=0.25,
    random_state=2,
)
# Names of the categorical / object-typed columns of the training frame.
cat_features = X_train.select_dtypes(include=['category', 'object']).columns.to_list()
cat_features
['region', 'building_type', 'object_type', 'floor_level']
# Names of the numeric columns of the training frame.
num_features = X_train.select_dtypes(include=['number']).columns.to_list()
num_features
['geo_lat', 'geo_lon', 'level', 'levels', 'rooms', 'area', 'kitchen_area']
https://scikit-learn.org/stable/api/sklearn.preprocessing.html - разные способы кодирования и скалирования
s_scaler = StandardScaler()
# unknown_value must be chosen with care
l_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=999)
regressor = RandomForestRegressor(n_estimators=20, max_depth=10)
Column transformer
# For convenient handling of columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', s_scaler, num_features),   # transformations for numeric features
        ('cat', l_encoder, cat_features),  # transformations for categorical features
    ],
    remainder='drop',  # drop the columns that no transformer touches
)
# Chain preprocessing and the regressor so both are fitted and applied together.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', regressor)])

pipeline.fit(X_train, y_train)
Pipeline(steps=[('preprocessor', ColumnTransformer(transformers=[('num', StandardScaler(), ['geo_lat', 'geo_lon', 'level', 'levels', 'rooms', 'area', 'kitchen_area']), ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=999), ['region', 'building_type', 'object_type', 'floor_level'])])), ('model', RandomForestRegressor(max_depth=10, n_estimators=20))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor', ColumnTransformer(transformers=[('num', StandardScaler(), ['geo_lat', 'geo_lon', 'level', 'levels', 'rooms', 'area', 'kitchen_area']), ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=999), ['region', 'building_type', 'object_type', 'floor_level'])])), ('model', RandomForestRegressor(max_depth=10, n_estimators=20))])
ColumnTransformer(transformers=[('num', StandardScaler(), ['geo_lat', 'geo_lon', 'level', 'levels', 'rooms', 'area', 'kitchen_area']), ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=999), ['region', 'building_type', 'object_type', 'floor_level'])])
['geo_lat', 'geo_lon', 'level', 'levels', 'rooms', 'area', 'kitchen_area']
StandardScaler()
['region', 'building_type', 'object_type', 'floor_level']
OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=999)
RandomForestRegressor(max_depth=10, n_estimators=20)
# Evaluate the fitted pipeline on the held-out split.
predictions = pipeline.predict(X_test)

# The keys of this dict become the metric names when it is passed to
# mlflow.log_metrics().
metrics = {
    "mae": mean_absolute_error(y_test, predictions),
    "mape": mean_absolute_percentage_error(y_test, predictions),
    "mse": mean_squared_error(y_test, predictions),
}

metrics
{'mae': np.float64(1276343.108894747),
'mape': np.float64(0.35471390164231303),
'mse': np.float64(174567675833231.12)}
# Work with MLflow locally
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# Tracking and the model registry use the same URL here.
registry_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_registry_uri(registry_uri)
# Name of the test experiment, of the run inside it, and the name under which
# the model will be registered
EXPERIMENT_NAME = "estate_project"
RUN_NAME = "baseline model"
REGISTRY_MODEL_NAME = "estate_model_rf"
Логируем вручную
# Always log the model signature and an input example. Prepare them here.
from mlflow.models import infer_signature

# The signature is inferred from the first five training rows (inputs only);
# the same rows are reused as the logged input example.
signature = infer_signature(model_input=X_train.head(5))
input_example = X_train.head(5)
/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
warnings.warn(
# We will log the requirements file and an artifact - a text file
req_file = 'requirements.txt'
art = 'comment.txt'
# The parameters to be logged can be set by hand or taken entirely from the model
#params_dict = {'n_estimators': 10, 'max_depth': 10}
params_dict = pipeline.get_params()
# Reuse the experiment if it already exists, otherwise create it.
# mlflow.create_experiment() raises when an experiment with the same name is
# already present, so re-running this cell has to go through the lookup:
# later runs are added to the same experiment via its id.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # get the unique identifier of this run
    run_id = run.info.run_id

    mlflow.sklearn.log_model(
        pipeline,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        pip_requirements=req_file,
    )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(params_dict)

# Re-fetch the run after the `with` block has closed it and check that it
# ended in the FINISHED state.
run = mlflow.get_run(run_id)
assert (run.info.status == 'FINISHED')
2024/10/03 18:59:13 INFO mlflow.tracking._tracking_service.client: 🏃 View run baseline model at: http://127.0.0.1:5000/#/experiments/1/runs/24e41bb582554f42953fe6dc2b6b190e.
2024/10/03 18:59:13 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.
Удаление runs, experiments
Использовать осторожно
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
# Deleting the experiment is left commented out on purpose - use with caution.
#mlflow.delete_experiment(experiment_id)
# List the runs of the experiment; the commented-out arguments show how to
# select by experiment id or filter by status / metric instead.
mlflow.search_runs(
    #experiment_ids=[experiment_id],
    experiment_names=[EXPERIMENT_NAME],
    # filter_string='status = "FAILED"'
    #filter_string='metrics.mae > 1'
)
run_id | experiment_id | status | artifact_uri | start_time | end_time | metrics.mae | metrics.mape | metrics.mse | params.preprocessor__cat__handle_unknown | ... | params.model__max_samples | params.preprocessor__transformers | params.model__monotonic_cst | params.model__warm_start | params.preprocessor__remainder | tags.mlflow.user | tags.mlflow.source.type | tags.mlflow.runName | tags.mlflow.source.name | tags.mlflow.log-model.history | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 24e41bb582554f42953fe6dc2b6b190e | 1 | FINISHED | mlflow-artifacts:/1/24e41bb582554f42953fe6dc2b... | 2024-10-03 15:59:12.732000+00:00 | 2024-10-03 15:59:13.921000+00:00 | 1.276343e+06 | 0.354714 | 1.745677e+14 | use_encoded_value | ... | None | [('num', StandardScaler(), ['geo_lat', 'geo_lo... | None | False | drop | andrey | LOCAL | baseline model | /home/andrey/work/institute/MLE/assets/mlflow/... | [{"run_id": "24e41bb582554f42953fe6dc2b6b190e"... |
1 rows × 57 columns
#mlflow.delete_run('74d2a7a40c07413c9cf65df841164356')
Автологирование
После включения автологирование будет срабатывать при каждом обучении модели (при вызове метода fit()).
Есть плюсы, есть и минусы. Предлагается сделать прогон и сравнить с результатами ручного логирования.
# Enable scikit-learn autologging, then refit inside a run named 'auto'.
mlflow.sklearn.autolog()

with mlflow.start_run(run_name='auto', experiment_id=experiment_id) as run:
    pipeline.fit(X_train, y_train)
2024/10/03 18:59:14 WARNING mlflow.utils.autologging_utils: MLflow sklearn autologging is known to be compatible with 0.24.1 <= scikit-learn <= 1.5.1, but the installed version is 1.5.2. If you encounter errors during autologging, try upgrading / downgrading scikit-learn to a compatible version, or try upgrading MLflow.
2024/10/03 19:02:16 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details."
2024/10/03 19:02:40 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details."
2024/10/03 19:02:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run auto at: http://127.0.0.1:5000/#/experiments/1/runs/2ced09116c264623b89d8df7fe33cb10.
2024/10/03 19:02:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.
# Disable autologging
mlflow.sklearn.autolog(disable=True)
Model #2
Обучим вторую "маленькую" модель
# Second model with fewer and shallower trees; it reuses the same preprocessor
# and rebinds `pipeline`.
regressor2 = RandomForestRegressor(n_estimators=10, max_depth=6)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', regressor2)])

pipeline.fit(X_train, y_train)
Pipeline(steps=[('preprocessor', ColumnTransformer(transformers=[('num', StandardScaler(), ['geo_lat', 'geo_lon', 'level', 'levels', 'rooms', 'area', 'kitchen_area']), ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=999), ['region', 'building_type', 'object_type', 'floor_level'])])), ('model', RandomForestRegressor(max_depth=6, n_estimators=10))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor', ColumnTransformer(transformers=[('num', StandardScaler(), ['geo_lat', 'geo_lon', 'level', 'levels', 'rooms', 'area', 'kitchen_area']), ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=999), ['region', 'building_type', 'object_type', 'floor_level'])])), ('model', RandomForestRegressor(max_depth=6, n_estimators=10))])
ColumnTransformer(transformers=[('num', StandardScaler(), ['geo_lat', 'geo_lon', 'level', 'levels', 'rooms', 'area', 'kitchen_area']), ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=999), ['region', 'building_type', 'object_type', 'floor_level'])])
['geo_lat', 'geo_lon', 'level', 'levels', 'rooms', 'area', 'kitchen_area']
StandardScaler()
['region', 'building_type', 'object_type', 'floor_level']
OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=999)
RandomForestRegressor(max_depth=6, n_estimators=10)
# Evaluate the second pipeline on the held-out split; this overwrites the
# `predictions` and `metrics` of the first model.
predictions = pipeline.predict(X_test)
metrics = {
    "mae": mean_absolute_error(y_test, predictions),
    "mape": mean_absolute_percentage_error(y_test, predictions),
    "mse": mean_squared_error(y_test, predictions),
}

metrics
{'mae': np.float64(1536543.887713661),
'mape': np.float64(0.42528854535519156),
'mse': np.float64(210549541556055.7)}
# !!! Check the run name and make sure that all logged parameters and artifacts
# correspond to the second, "small" model.
RUN_NAME = 'smaller_model'

# When creating a new experiment:
#experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
# Afterwards, to add runs to the same experiment, we must look up its id:
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # get the unique identifier of this run
    run_id = run.info.run_id

    mlflow.sklearn.log_model(
        pipeline,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        pip_requirements=req_file,
    )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    # Read the parameters from the current pipeline so they describe this model.
    mlflow.log_params(pipeline.get_params())

# Re-fetch the run after the `with` block has closed it and check that it
# ended in the FINISHED state.
run = mlflow.get_run(run_id)
assert (run.info.status == 'FINISHED')
2024/10/03 19:02:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run smaller_model at: http://127.0.0.1:5000/#/experiments/1/runs/20f66bd4c3754a04b5e47ecc0f577e76.
2024/10/03 19:02:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.
# No model
# It is possible to log artifacts only, without a model - for example, plots
# produced at the EDA stage.
RUN_NAME = 'no_model'
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    mlflow.log_artifact(art)

# Re-fetch the run after the `with` block has closed it and check that it
# ended in the FINISHED state.
run = mlflow.get_run(run_id)
assert (run.info.status == 'FINISHED')
2024/10/03 19:02:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run no_model at: http://127.0.0.1:5000/#/experiments/1/runs/6f6fe970eb74485d866e918b733f8f61.
2024/10/03 19:02:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.
run_id = ''  # Specify the run id of the run whose model should be registered
# The model is taken from that run's "models" artifact path.
mlflow.register_model(f"runs:/{run_id}/models", REGISTRY_MODEL_NAME)
Registered model 'estate_model_rf' already exists. Creating a new version of this model...
2024/10/03 19:03:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: estate_model_rf, version 1
Created version '1' of model 'estate_model_rf'.
<ModelVersion: aliases=[], creation_timestamp=1727971394174, current_stage='None', description='', last_updated_timestamp=1727971394174, name='estate_model_rf', run_id='24e41bb582554f42953fe6dc2b6b190e', run_link='', source='mlflow-artifacts:/1/24e41bb582554f42953fe6dc2b6b190e/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='1'>
# A model can also be registered right when the run is created
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name='register_at_run', experiment_id=experiment_id) as run:
    # get the unique identifier of this run
    run_id = run.info.run_id

    mlflow.sklearn.log_model(
        pipeline,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        pip_requirements=req_file,
        registered_model_name=REGISTRY_MODEL_NAME,  # the registered model to add a version to
    )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(pipeline.get_params())

# Re-fetch the run after the `with` block has closed it and check that it
# ended in the FINISHED state.
run = mlflow.get_run(run_id)
assert (run.info.status == 'FINISHED')
Registered model 'estate_model_rf' already exists. Creating a new version of this model...
2024/10/03 19:03:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: estate_model_rf, version 2
Created version '2' of model 'estate_model_rf'.
2024/10/03 19:03:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run register_at_run at: http://127.0.0.1:5000/#/experiments/1/runs/ed64a91759ed43c99329810d066ea95a.
2024/10/03 19:03:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.
# Registered models can be looked up
model_reg = mlflow.search_registered_models()
model_reg[0]
<RegisteredModel: aliases={}, creation_timestamp=1727971371173, description='', last_updated_timestamp=1727971394354, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1727971394354, current_stage='None', description='', last_updated_timestamp=1727971394354, name='estate_model_rf', run_id='ed64a91759ed43c99329810d066ea95a', run_link='', source='mlflow-artifacts:/1/ed64a91759ed43c99329810d066ea95a/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='2'>], name='estate_model_rf', tags={}>
# Load a specific registered version of the model and predict on the first
# test row; the 0:1 slice keeps the input a one-row DataFrame.
model_name = REGISTRY_MODEL_NAME
model_version = 1

model_loaded = mlflow.sklearn.load_model(model_uri=f"models:/{model_name}/{model_version}")
model_loaded.predict(X_test.iloc[0:1])
array([3438055.97819847])
# Actual target value of the same first test row, for comparison with the prediction.
y_test.iloc[0]
np.int64(3062900)