89 KiB

Исходник Вина История Unescape Escape

import os
import mlflow

from sklearn.model_selection import train_test_split
import pandas as pd
import numpy

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error

# Keep a 10% random sample so that the model trains faster during the lecture
df = (
    pd.read_pickle('data/clean_data.pkl')
    .sample(frac=0.1, random_state=2)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 539355 entries, 1979096 to 5189500
Data columns (total 12 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   price          539355 non-null  int64   
 1   geo_lat        539355 non-null  float32 
 2   geo_lon        539355 non-null  float32 
 3   region         539355 non-null  category
 4   building_type  539355 non-null  category
 5   level          539355 non-null  int8    
 6   levels         539355 non-null  int8    
 7   rooms          539355 non-null  int8    
 8   area           539355 non-null  float16 
 9   kitchen_area   539355 non-null  float16 
 10  object_type    539355 non-null  category
 11  floor_level    539355 non-null  object  
dtypes: category(3), float16(2), float32(2), int64(1), int8(3), object(1)
memory usage: 21.6+ MB

# The price column is the prediction target; rename it accordingly
df = df.rename({'price': 'target'}, axis='columns')

df

/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/pandas/io/formats/format.py:1458: RuntimeWarning: overflow encountered in cast
  has_large_values = (abs_vals > 1e6).any()

	target	geo_lat	geo_lon	region	building_type	level	levels	rooms	area	kitchen_area	object_type	floor_level
1979096	1300000	52.821098	83.113037	6817	1	1	1	3	66.50000	10.000000	1	first
1833303	8800000	55.707539	37.467068	3	1	15	16	2	46.00000	7.000000	1	hi
1494335	1958000	54.988400	82.783691	9654	2	13	17	1	36.50000	11.960938	11	hi
2747476	1461600	53.298553	50.326382	3106	3	5	5	1	32.59375	9.601562	11	last
5027275	3000000	42.897934	47.624825	4007	3	4	10	2	70.00000	12.000000	11	mid
...	...	...	...	...	...	...	...	...	...	...	...	...
2476626	1490000	54.943806	82.957870	9654	1	2	10	1	48.06250	14.000000	11	low
1487454	19000000	55.772240	37.731136	3	3	4	12	3	100.00000	13.000000	1	mid
2772844	1200000	54.474590	53.531807	2722	1	5	9	1	32.09375	7.000000	1	mid
3982304	2300000	55.378265	39.053310	81	1	1	5	2	49.00000	9.000000	1	first
5189500	9157730	55.542957	37.479919	3	1	8	17	2	52.31250	17.593750	11	mid

539355 rows × 12 columns

# Hold out 25% of the rows for testing; the split is seeded for repeatability
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='target'),
    df['target'],
    test_size=0.25,
    random_state=2,
)

# Names of the columns stored with the 'category' or 'object' dtype
cat_features = list(X_train.select_dtypes(include=['category', 'object']).columns)
cat_features

['region', 'building_type', 'object_type', 'floor_level']

# Names of the columns with a numeric dtype
num_features = list(X_train.select_dtypes(include=['number']).columns)
num_features

['geo_lat', 'geo_lon', 'level', 'levels', 'rooms', 'area', 'kitchen_area']

https://scikit-learn.org/stable/api/sklearn.preprocessing.html — разные способы кодирования и масштабирования признаков

s_scaler = StandardScaler()
# unknown_value has to be chosen with care
# NOTE(review): it presumably must not collide with a code assigned to a seen
# category - confirm against the OrdinalEncoder documentation
l_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=999)
# Seed the forest so that the trained model, and therefore the metrics logged
# for it, are reproducible between notebook runs (the data sampling and the
# train/test split in this notebook are seeded with the same value)
regressor = RandomForestRegressor(n_estimators=20, max_depth=10, random_state=2)

Column transformer

# ColumnTransformer makes it convenient to treat groups of columns differently
preprocessor = ColumnTransformer(
    [
        # transformation applied to the numeric features
        ('num', s_scaler, num_features),
        # transformation applied to the categorical features
        ('cat', l_encoder, cat_features),
    ],
    # columns not listed in any transformer are dropped
    remainder='drop',
)


# Chain the preprocessing and the regressor into a single estimator
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', regressor),
])

pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['geo_lat', 'geo_lon',
                                                   'level', 'levels', 'rooms',
                                                   'area', 'kitchen_area']),
                                                 ('cat',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=999),
                                                  ['region', 'building_type',
                                                   'object_type',
                                                   'floor_level'])])),
                ('model',
                 RandomForestRegressor(max_depth=10, n_estimators=20))])

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Pipeline?Documentation for PipelineiFitted

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['geo_lat', 'geo_lon',
                                                   'level', 'levels', 'rooms',
                                                   'area', 'kitchen_area']),
                                                 ('cat',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=999),
                                                  ['region', 'building_type',
                                                   'object_type',
                                                   'floor_level'])])),
                ('model',
                 RandomForestRegressor(max_depth=10, n_estimators=20))])

preprocessor: ColumnTransformer?Documentation for preprocessor: ColumnTransformer

ColumnTransformer(transformers=[('num', StandardScaler(),
                                 ['geo_lat', 'geo_lon', 'level', 'levels',
                                  'rooms', 'area', 'kitchen_area']),
                                ('cat',
                                 OrdinalEncoder(handle_unknown='use_encoded_value',
                                                unknown_value=999),
                                 ['region', 'building_type', 'object_type',
                                  'floor_level'])])

num

['geo_lat', 'geo_lon', 'level', 'levels', 'rooms', 'area', 'kitchen_area']

StandardScaler?Documentation for StandardScaler

StandardScaler()

cat

['region', 'building_type', 'object_type', 'floor_level']

OrdinalEncoder?Documentation for OrdinalEncoder

OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=999)

RandomForestRegressor?Documentation for RandomForestRegressor

RandomForestRegressor(max_depth=10, n_estimators=20)

# Score the fitted pipeline on the held-out test set
predictions = pipeline.predict(X_test)

metrics = {
    "mae": mean_absolute_error(y_test, predictions),
    "mape": mean_absolute_percentage_error(y_test, predictions),
    "mse": mean_squared_error(y_test, predictions),
}

metrics

{'mae': np.float64(1276343.108894747),
 'mape': np.float64(0.35471390164231303),
 'mse': np.float64(174567675833231.12)}


# Work with a locally running MLflow server
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

# Tracking and the model registry are addressed through the same endpoint
tracking_uri = "http://{}:{}".format(TRACKING_SERVER_HOST, TRACKING_SERVER_PORT)
registry_uri = "http://{}:{}".format(TRACKING_SERVER_HOST, TRACKING_SERVER_PORT)

# Point the MLflow client at the server for both run tracking and the model registry
mlflow.set_tracking_uri(uri=tracking_uri)
mlflow.set_registry_uri(uri=registry_uri)

# Names of the test experiment, of the run inside it, and the name under which
# the model will be registered
EXPERIMENT_NAME = 'estate_project'
RUN_NAME = 'baseline model'
REGISTRY_MODEL_NAME = 'estate_model_rf'

Логируем вручную

# Always log the model signature and an input example; prepare both here
from mlflow.models import infer_signature

input_example = X_train.head(5)
# Infer the schema from the inputs AND from the predictions made for them, so
# that the logged signature describes the model output as well as its input
signature = infer_signature(
    model_input=input_example,
    model_output=pipeline.predict(input_example),
)

/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
  warnings.warn(

# Files logged together with the run: the requirements and a text-file artifact
req_file, art = 'requirements.txt', 'comment.txt'

# Parameters to log can be written by hand, e.g.
# {'n_estimators': 10, 'max_depth': 10}, or taken in full from the model
params_dict = pipeline.get_params(deep=True)

# create_experiment() fails when an experiment with this name already exists,
# so look the experiment up first and create it only on the very first run.
# This keeps the cell re-runnable: later runs are added to the same experiment.
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    experiment_id = experiment.experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # unique identifier of this run
    run_id = run.info.run_id
    # the pipeline together with its signature, input example and requirements
    mlflow.sklearn.log_model(
        pipeline,
        artifact_path="models",
        signature=signature,
        input_example=input_example,
        pip_requirements=req_file,
    )
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(params_dict)

# Fetch the run again by id and check that it was closed successfully
run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'

2024/10/03 18:59:13 INFO mlflow.tracking._tracking_service.client: 🏃 View run baseline model at: http://127.0.0.1:5000/#/experiments/1/runs/24e41bb582554f42953fe6dc2b6b190e.
2024/10/03 18:59:13 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.

Удаление runs, experiments

Использовать осторожно

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
# To delete the whole experiment: mlflow.delete_experiment(experiment_id)

# List the runs of the experiment, selected here by experiment name.
# Alternatives: select with experiment_ids=[experiment_id], or narrow the result
# with filter_string, e.g. 'status = "FAILED"' or 'metrics.mae > 1'
mlflow.search_runs(experiment_names=[EXPERIMENT_NAME])

	run_id	experiment_id	status	artifact_uri	start_time	end_time	metrics.mae	metrics.mape	metrics.mse	params.preprocessor__cat__handle_unknown	...	params.model__max_samples	params.preprocessor__transformers	params.model__monotonic_cst	params.model__warm_start	params.preprocessor__remainder	tags.mlflow.user	tags.mlflow.source.type	tags.mlflow.runName	tags.mlflow.source.name	tags.mlflow.log-model.history
0	24e41bb582554f42953fe6dc2b6b190e	1	FINISHED	mlflow-artifacts:/1/24e41bb582554f42953fe6dc2b...	2024-10-03 15:59:12.732000+00:00	2024-10-03 15:59:13.921000+00:00	1.276343e+06	0.354714	1.745677e+14	use_encoded_value	...	None	[('num', StandardScaler(), ['geo_lat', 'geo_lo...	None	False	drop	andrey	LOCAL	baseline model	/home/andrey/work/institute/MLE/assets/mlflow/...	[{"run_id": "24e41bb582554f42953fe6dc2b6b190e"...

1 rows × 57 columns


#mlflow.delete_run('74d2a7a40c07413c9cf65df841164356')

Автологирование

После включения будет срабатывать на каждом обучении модели (на методе fit()).

Есть и плюсы, и минусы. Предлагается сделать прогон и сравнить его результаты с тем, что было залогировано вручную

# Switch on scikit-learn autologging
mlflow.sklearn.autolog()

# Refit the pipeline inside a dedicated run named 'auto' of the same experiment
with mlflow.start_run(experiment_id=experiment_id, run_name='auto') as run:
    pipeline.fit(X_train, y_train)

2024/10/03 18:59:14 WARNING mlflow.utils.autologging_utils: MLflow sklearn autologging is known to be compatible with 0.24.1 <= scikit-learn <= 1.5.1, but the installed version is 1.5.2. If you encounter errors during autologging, try upgrading / downgrading scikit-learn to a compatible version, or try upgrading MLflow.
2024/10/03 19:02:16 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details."
2024/10/03 19:02:40 WARNING mlflow.utils.autologging_utils: MLflow autologging encountered a warning: "/home/andrey/work/institute/MLE/assets/mlflow/.venv_lec_mlflow/lib/python3.10/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details."
2024/10/03 19:02:42 INFO mlflow.tracking._tracking_service.client: 🏃 View run auto at: http://127.0.0.1:5000/#/experiments/1/runs/2ced09116c264623b89d8df7fe33cb10.
2024/10/03 19:02:42 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.

# Turn autologging off again
mlflow.sklearn.autolog(disable=True)

Model #2

Обучим вторую "маленькую" модель

# Second, smaller forest. It is seeded so that its results are reproducible
# between notebook runs (the data sampling and the train/test split in this
# notebook are seeded with the same value)
regressor2 = RandomForestRegressor(n_estimators=10, max_depth=6, random_state=2)

# Same preprocessing as before, now followed by the smaller regressor
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', regressor2),
])

pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  ['geo_lat', 'geo_lon',
                                                   'level', 'levels', 'rooms',
                                                   'area', 'kitchen_area']),
                                                 ('cat',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=999),
                                                  ['region', 'building_type',
                                                   'object_type',
                                                   'floor_level'])])),
                ('model', RandomForestRegressor(max_depth=6, n_estimators=10))])