Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

89 KiB

import os
import mlflow

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error
# Load the cleaned dataset and keep a reproducible 10% sample so that the
# model trains faster during the lecture.
df = (
    pd.read_pickle('data/clean_data.pkl')
    .sample(frac=0.1, random_state=2)
)
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 539355 entries, 1979096 to 5189500
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   price          539355 non-null  int64   
 1   geo_lat        539355 non-null  float32 
 2   geo_lon        539355 non-null  float32 
 3   region         539355 non-null  category
 4   building_type  539355 non-null  category
 5   level          539355 non-null  int8    
 6   levels         539355 non-null  int8    
 7   rooms          539355 non-null  int8    
 8   area           539355 non-null  float16 
 9   kitchen_area   539355 non-null  float16 
 10  object_type    539355 non-null  category
 11  floor_level    539355 non-null  object  
dtypes: category(3), float16(2), float32(2), int64(1), int8(3), object(1)
memory usage: 21.6+ MB
# The column being predicted is called 'target' from here on.
df = df.rename({'price': 'target'}, axis='columns')
df
/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/pandas/io/formats/format.py:1458: RuntimeWarning: overflow encountered in cast
  has_large_values = (abs_vals > 1e6).any()
target geo_lat geo_lon region building_type level levels rooms area kitchen_area object_type floor_level
1979096 1300000 52.821098 83.113037 6817 1 1 1 3 66.50000 10.000000 1 first
1833303 8800000 55.707539 37.467068 3 1 15 16 2 46.00000 7.000000 1 hi
1494335 1958000 54.988400 82.783691 9654 2 13 17 1 36.50000 11.960938 11 hi
2747476 1461600 53.298553 50.326382 3106 3 5 5 1 32.59375 9.601562 11 last
5027275 3000000 42.897934 47.624825 4007 3 4 10 2 70.00000 12.000000 11 mid
... ... ... ... ... ... ... ... ... ... ... ... ...
2476626 1490000 54.943806 82.957870 9654 1 2 10 1 48.06250 14.000000 11 low
1487454 19000000 55.772240 37.731136 3 3 4 12 3 100.00000 13.000000 1 mid
2772844 1200000 54.474590 53.531807 2722 1 5 9 1 32.09375 7.000000 1 mid
3982304 2300000 55.378265 39.053310 81 1 1 5 2 49.00000 9.000000 1 first
5189500 9157730 55.542957 37.479919 3 1 8 17 2 52.31250 17.593750 11 mid

539355 rows × 12 columns

# Hold out 25% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target'),
    df['target'],
    test_size=0.25,
    random_state=2,
)
# Names of the columns stored with a 'category' or 'object' dtype.
cat_features = list(X_train.select_dtypes(include=['category', 'object']).columns)
cat_features
['region', 'building_type', 'object_type', 'floor_level']
# Names of the columns stored with a numeric dtype.
num_features = list(X_train.select_dtypes(include=['number']).columns)
num_features
['geo_lat', 'geo_lon', 'level', 'levels', 'rooms', 'area', 'kitchen_area']

https://scikit-learn.org/stable/api/sklearn.preprocessing.html - разные способы кодирования и скалирования

# Scaler for the numeric features.
s_scaler = StandardScaler()
# Encoder for the categorical features. unknown_value must be chosen with care:
# scikit-learn requires it to differ from every code assigned to a category seen in fit.
l_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=999)
# Seed the forest so that repeated runs give the same model and the same logged
# metrics (the data sample and the train/test split are seeded with 2 as well).
regressor = RandomForestRegressor(n_estimators=20, max_depth=10, random_state=2)

Column transformer

# ColumnTransformer applies a separate transformation to each group of columns.
preprocessor = ColumnTransformer(
    [
        ('num', s_scaler, num_features),   # numeric features
        ('cat', l_encoder, cat_features),  # categorical features
    ],
    remainder='drop',  # drop the columns that no transformer handles
)

# Preprocessing and the model are chained into one estimator.
pipeline = Pipeline([('preprocessor', preprocessor), ('model', regressor)])

pipeline.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['geo_lat', 'geo_lon',
                                                   'level', 'levels', 'rooms',
                                                   'area', 'kitchen_area']),
                                                 ('cat',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=999),
                                                  ['region', 'building_type',
                                                   'object_type',
                                                   'floor_level'])])),
                ('model',
                 RandomForestRegressor(max_depth=10, n_estimators=20))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['geo_lat', 'geo_lon',
                                                   'level', 'levels', 'rooms',
                                                   'area', 'kitchen_area']),
                                                 ('cat',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=999),
                                                  ['region', 'building_type',
                                                   'object_type',
                                                   'floor_level'])])),
                ('model',
                 RandomForestRegressor(max_depth=10, n_estimators=20))])
ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['geo_lat', 'geo_lon', 'level', 'levels',
                                  'rooms', 'area', 'kitchen_area']),
                                ('cat',
                                 OrdinalEncoder(handle_unknown='use_encoded_value',
                                                unknown_value=999),
                                 ['region', 'building_type', 'object_type',
                                  'floor_level'])])
['geo_lat', 'geo_lon', 'level', 'levels', 'rooms', 'area', 'kitchen_area']
StandardScaler()
['region', 'building_type', 'object_type', 'floor_level']
OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=999)
RandomForestRegressor(max_depth=10, n_estimators=20)
# Score the fitted pipeline on the hold-out set.
predictions = pipeline.predict(X_test)

# Collect the regression metrics in a dict so they can be logged in one call.
metrics = {
    "mae": mean_absolute_error(y_test, predictions),
    "mape": mean_absolute_percentage_error(y_test, predictions),
    "mse": mean_squared_error(y_test, predictions),
}

metrics
{'mae': np.float64(1276343.108894747),
 'mape': np.float64(0.35471390164231303),
 'mse': np.float64(174567675833231.12)}

# We work with a local MLflow server
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# Tracking and the model registry are served from the same address.
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
registry_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"

mlflow.set_registry_uri(registry_uri)
mlflow.set_tracking_uri(tracking_uri)

# Names of the experiment, of the first run inside it,
# and of the entry under which the model is registered.
EXPERIMENT_NAME = "estate_project"
RUN_NAME = "baseline model"
REGISTRY_MODEL_NAME = "estate_model_rf"

Логируем вручную

# Always log the model signature and an input example. Prepare both here.
from mlflow.models import infer_signature

# A few training rows serve as the input example.
input_example = X_train.head(5)
# Pass the predictions too, so that the signature describes the model output
# as well as its input columns.
signature = infer_signature(
    model_input=input_example,
    model_output=pipeline.predict(input_example),
)
/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
  warnings.warn(
# We will log the requirements file and one artifact (a text file)
req_file = 'requirements.txt'
art = 'comment.txt'
# The parameters to log can be set by hand or taken in full from the model
#params_dict = {'n_estimators': 10, 'max_depth': 10}
params_dict = pipeline.get_params()

# Reuse the experiment when it already exists, so that this cell can be re-run:
# mlflow.create_experiment() raises if the name is already taken.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # unique identifier of this run
    run_id = run.info.run_id
    mlflow.sklearn.log_model(pipeline,
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file
                             )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(params_dict)

# Re-read the run from the server and check that it was closed successfully.
run = mlflow.get_run(run_id)
assert (run.info.status =='FINISHED')
2024/10/03 18:59:13 INFO mlflow.tracking._tracking_service.client: 🏃 View run baseline model at: http://127.0.0.1:5000/#/experiments/1/runs/24e41bb582554f42953fe6dc2b6b190e.
2024/10/03 18:59:13 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.

Удаление runs, experiments

Использовать осторожно

# Look up the id of the experiment by its name.
experiment_id = mlflow.get_experiment_by_name(name=EXPERIMENT_NAME).experiment_id
#mlflow.delete_experiment(experiment_id)

# List the runs of the experiment. Alternatives to try:
#   experiment_ids=[experiment_id] instead of experiment_names
#   filter_string='status = "FAILED"'
#   filter_string='metrics.mae > 1'
mlflow.search_runs(experiment_names=[EXPERIMENT_NAME])
run_id experiment_id status artifact_uri start_time end_time metrics.mae metrics.mape metrics.mse params.preprocessor__cat__handle_unknown ... params.model__max_samples params.preprocessor__transformers params.model__monotonic_cst params.model__warm_start params.preprocessor__remainder tags.mlflow.user tags.mlflow.source.type tags.mlflow.runName tags.mlflow.source.name tags.mlflow.log-model.history
0 24e41bb582554f42953fe6dc2b6b190e 1 FINISHED mlflow-artifacts:/1/24e41bb582554f42953fe6dc2b... 2024-10-03 15:59:12.732000+00:00 2024-10-03 15:59:13.921000+00:00 1.276343e+06 0.354714 1.745677e+14 use_encoded_value ... None [('num', StandardScaler(), ['geo_lat', 'geo_lo... None False drop andrey LOCAL baseline model /home/andrey/work/institute/MLE/assets/mlflow/... [{"run_id": "24e41bb582554f42953fe6dc2b6b190e"...

1 rows × 57 columns


#mlflow.delete_run('74d2a7a40c07413c9cf65df841164356')

Автологирование

После включения будет срабатывать на каждом обучении модели (на методе fit()).

Есть плюсы, есть и минусы. Предлагается сделать прогон и сравнить его с результатами ручного логирования.

# Turn on scikit-learn autologging before the model is fitted.
mlflow.sklearn.autolog()

# Fit inside a dedicated run named 'auto'.
with mlflow.start_run(experiment_id=experiment_id, run_name='auto') as run:
    pipeline.fit(X=X_train, y=y_train)
2024/10/03 18:59:14 WARNING mlflow.utils.autologging_utils: MLflow sklearn autologging is known to be compatible with 0.24.1 <= scikit-learn <= 1.5.1, but the installed version is 1.5.2. If you encounter errors during autologging, try upgrading / downgrading scikit-learn to a compatible version, or try upgrading MLflow.
2024/10/03 19:02:16 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details."
2024/10/03 19:02:40 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details."
2024/10/03 19:02:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run auto at: http://127.0.0.1:5000/#/experiments/1/runs/2ced09116c264623b89d8df7fe33cb10.
2024/10/03 19:02:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.
# Turn autologging off again
mlflow.sklearn.autolog(disable=True)

Model #2

Обучим вторую "маленькую" модель

# A smaller forest (fewer, shallower trees). It is seeded so that repeated runs
# give the same model and the same logged metrics.
regressor2 = RandomForestRegressor(n_estimators=10, max_depth=6, random_state=2)
# The same preprocessing step is reused; only the model step differs.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', regressor2)])

pipeline.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['geo_lat', 'geo_lon',
                                                   'level', 'levels', 'rooms',
                                                   'area', 'kitchen_area']),
                                                 ('cat',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=999),
                                                  ['region', 'building_type',
                                                   'object_type',
                                                   'floor_level'])])),
                ('model', RandomForestRegressor(max_depth=6, n_estimators=10))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['geo_lat', 'geo_lon',
                                                   'level', 'levels', 'rooms',
                                                   'area', 'kitchen_area']),
                                                 ('cat',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=999),
                                                  ['region', 'building_type',
                                                   'object_type',
                                                   'floor_level'])])),
                ('model', RandomForestRegressor(max_depth=6, n_estimators=10))])
ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['geo_lat', 'geo_lon', 'level', 'levels',
                                  'rooms', 'area', 'kitchen_area']),
                                ('cat',
                                 OrdinalEncoder(handle_unknown='use_encoded_value',
                                                unknown_value=999),
                                 ['region', 'building_type', 'object_type',
                                  'floor_level'])])
['geo_lat', 'geo_lon', 'level', 'levels', 'rooms', 'area', 'kitchen_area']
StandardScaler()
['region', 'building_type', 'object_type', 'floor_level']
OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=999)
RandomForestRegressor(max_depth=6, n_estimators=10)
# Score the second pipeline on the same hold-out set.
predictions = pipeline.predict(X_test)
metrics = {
    "mae": mean_absolute_error(y_test, predictions),
    "mape": mean_absolute_percentage_error(y_test, predictions),
    "mse": mean_squared_error(y_test, predictions),
}

metrics
{'mae': np.float64(1536543.887713661),
 'mape': np.float64(0.42528854535519156),
 'mse': np.float64(210549541556055.7)}
# !!! Check the run name and every logged parameter and artifact:
# they must all describe the second, "smaller" model.

RUN_NAME = 'smaller_model'

# When a new experiment is being created:
#experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

# Afterwards, to add runs to that same experiment, look up its id:
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME) as run:
    # unique identifier of this run
    run_id = run.info.run_id
    mlflow.sklearn.log_model(
        pipeline,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        pip_requirements=req_file,
    )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(pipeline.get_params())

# Re-read the run from the server and check that it was closed successfully.
run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'
2024/10/03 19:02:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run smaller_model at: http://127.0.0.1:5000/#/experiments/1/runs/20f66bd4c3754a04b5e47ecc0f577e76.
2024/10/03 19:02:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.
# No model
# A run may contain artifacts only, without a model,
# for example plots produced at the EDA stage.

RUN_NAME = 'no_model'
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(experiment_id=experiment_id, run_name=RUN_NAME) as run:
    run_id = run.info.run_id
    mlflow.log_artifact(art)

# Re-read the run from the server and check that it was closed successfully.
run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'
2024/10/03 19:02:51 INFO mlflow.tracking._tracking_service.client: 🏃 View run no_model at: http://127.0.0.1:5000/#/experiments/1/runs/6f6fe970eb74485d866e918b733f8f61.
2024/10/03 19:02:51 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.
run_id = ''  # put the id of the run to register here
if not run_id:
    # An empty id would produce the invalid URI "runs://models". Fall back to
    # the most recent run named "baseline model" in the experiment.
    baseline_runs = mlflow.search_runs(
        experiment_names=[EXPERIMENT_NAME],
        filter_string="tags.mlflow.runName = 'baseline model'",
        order_by=["attributes.start_time DESC"],
        max_results=1,
    )
    if baseline_runs.empty:
        raise ValueError(
            "run_id is empty and no run named 'baseline model' was found; "
            "set run_id explicitly"
        )
    run_id = baseline_runs.iloc[0]["run_id"]
# Register the model stored under the "models" artifact path of that run.
mlflow.register_model(f"runs:/{run_id}/models", REGISTRY_MODEL_NAME)
Registered model 'estate_model_rf' already exists. Creating a new version of this model...
2024/10/03 19:03:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: estate_model_rf, version 1
Created version '1' of model 'estate_model_rf'.
<ModelVersion: aliases=[], creation_timestamp=1727971394174, current_stage='None', description='', last_updated_timestamp=1727971394174, name='estate_model_rf', run_id='24e41bb582554f42953fe6dc2b6b190e', run_link='', source='mlflow-artifacts:/1/24e41bb582554f42953fe6dc2b6b190e/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='1'>
# A model can also be registered right when the run is created

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(experiment_id=experiment_id, run_name='register_at_run') as run:
    # unique identifier of this run
    run_id = run.info.run_id
    mlflow.sklearn.log_model(
        pipeline,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        pip_requirements=req_file,
        # name of the registry entry that receives the new model version
        registered_model_name=REGISTRY_MODEL_NAME,
    )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(pipeline.get_params())

# Re-read the run from the server and check that it was closed successfully.
run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'
Registered model 'estate_model_rf' already exists. Creating a new version of this model...
2024/10/03 19:03:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: estate_model_rf, version 2
Created version '2' of model 'estate_model_rf'.
2024/10/03 19:03:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run register_at_run at: http://127.0.0.1:5000/#/experiments/1/runs/ed64a91759ed43c99329810d066ea95a.
2024/10/03 19:03:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.
# Registered models can be looked up in the registry
model_reg = mlflow.search_registered_models()
# Show the first entry of the search result
model_reg[0]
<RegisteredModel: aliases={}, creation_timestamp=1727971371173, description='', last_updated_timestamp=1727971394354, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1727971394354, current_stage='None', description='', last_updated_timestamp=1727971394354, name='estate_model_rf', run_id='ed64a91759ed43c99329810d066ea95a', run_link='', source='mlflow-artifacts:/1/ed64a91759ed43c99329810d066ea95a/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='2'>], name='estate_model_rf', tags={}>

# Pick a registered model by name and version number.
model_name = REGISTRY_MODEL_NAME
model_version = 1

# Load it from the registry and predict for the first test row.
model_loaded = mlflow.sklearn.load_model(f"models:/{model_name}/{model_version}")
model_loaded.predict(X_test.iloc[:1])
array([3438055.97819847])
# Actual target value of the first test row, to compare with the prediction
y_test.iloc[0]
np.int64(3062900)