import os
import mlflow
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error
df = pd.read_pickle('data/clean_data.pkl').sample(frac=0.1, random_state=2)  # Downsample so the model trains faster during the lecture
df.info()

df = df.rename(columns={'price': 'target'})
df = df.drop(columns=['date', 'time'])
df
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.25, random_state=2)

cat_features = X_train.select_dtypes(include=['category', 'object']).columns.to_list()
cat_features

num_features = X_train.select_dtypes(include=['number']).columns.to_list()
num_features
https://scikit-learn.org/stable/api/sklearn.preprocessing.html - various encoding and scaling options

s_scaler = StandardScaler()
l_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=99999999)  # unknown_value must be chosen carefully
regressor = CatBoostRegressor()
Column transformer
# For convenient column-wise processing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', s_scaler, num_features),   # transformations for numeric features
        ('cat', l_encoder, cat_features),  # transformations for categorical features
    ],
    remainder='drop')  # drop the columns not touched by the transformations
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', regressor)])

pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

metrics = {}
metrics["mae"] = mean_absolute_error(y_test, predictions)
metrics["mape"] = mean_absolute_percentage_error(y_test, predictions)
metrics["mse"] = mean_squared_error(y_test, predictions)

metrics
# Working with MLflow locally
TRACKING_SERVER_HOST = "127.0.0.1"
TRACKING_SERVER_PORT = 5000

registry_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"
tracking_uri = f"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_registry_uri(registry_uri)
# Names for the test experiment, for the run inside it, and for registering the model
EXPERIMENT_NAME = "estate_project"
RUN_NAME = "baseline model"
REGISTRY_MODEL_NAME = "estate_model_rf"
Manual logging
# Always log the model signature and an input example; let's prepare them
from mlflow.models import infer_signature

signature = infer_signature(model_input=X_train.head(5))
input_example = X_train.head(5)

# We will also log the requirements and an artifact (a text file)
req_file = 'requirements.txt'
art = 'comment.txt'

# The parameters to be logged can be set manually or taken entirely from the model
#params_dict = {'n_estimators': 10, 'max_depth': 10}
params_dict = pipeline.get_params()
# When creating a new experiment:
experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)

# Later, to add runs to the same experiment, we must get its id instead:
#experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
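In practice it is convenient to merge both cases into a single get-or-create helper; a minimal sketch (the helper name is ours, not part of the lecture code):

# Hypothetical helper: reuse the experiment when it already exists, create it otherwise
def get_or_create_experiment_id(name):
    experiment = mlflow.get_experiment_by_name(name)
    if experiment is not None:
        return experiment.experiment_id
    return mlflow.create_experiment(name)

experiment_id = get_or_create_experiment_id(EXPERIMENT_NAME)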
with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # get the unique id of this run
    run_id = run.info.run_id

    mlflow.sklearn.log_model(pipeline,
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file)
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(params_dict)

run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'
Deleting runs and experiments
Use with caution
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
#mlflow.delete_experiment(experiment_id)

mlflow.search_runs(
    #experiment_ids=[experiment_id],
    experiment_names=[EXPERIMENT_NAME],
    #filter_string='status = "FAILED"'
    #filter_string='metrics.mae > 1'
)

#mlflow.delete_run('74d2a7a40c07413c9cf65df841164356')
Autologging
Once enabled, it fires on every model training (on every fit() call).
It has both pros and cons; try a run and compare it with the manually logged results.
mlflow.sklearn.autolog()

with mlflow.start_run(run_name='auto', experiment_id=experiment_id) as run:
    pipeline.fit(X_train, y_train)

# Disable autologging
mlflow.sklearn.autolog(disable=True)
Model #2
Let's train a second, "smaller" model
regressor2 = RandomForestRegressor(n_estimators=10, max_depth=6)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', regressor2)])

pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)

metrics = {}
metrics["mae"] = mean_absolute_error(y_test, predictions)
metrics["mape"] = mean_absolute_percentage_error(y_test, predictions)
metrics["mse"] = mean_squared_error(y_test, predictions)

metrics
# !!! Check the run name and everything that gets logged (parameters, artifacts): they must correspond to the second "smaller" model.
RUN_NAME = 'smaller_model'

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # get the unique id of this run
    run_id = run.info.run_id

    mlflow.sklearn.log_model(pipeline,
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file)
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(pipeline.get_params())

run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'
# No model
# You can log artifacts alone, without a model: for example, plots from the EDA stage
RUN_NAME = 'no_model'
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    run_id = run.info.run_id
    mlflow.log_artifact(art)

run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'

run_id = '06fa7ec1f1b74aedb3509c88dc4ee1c0'  # specify the run id
mlflow.register_model(f"runs:/{run_id}/models", REGISTRY_MODEL_NAME)
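Once a version is registered, you can mark it for deployment. Recent MLflow versions support aliases through the client API; a minimal sketch (the alias name 'champion' is our choice):

from mlflow import MlflowClient

client = MlflowClient()
# point the 'champion' alias at version 1 of the registered model
client.set_registered_model_alias(REGISTRY_MODEL_NAME, "champion", version=1)
# the alias can then be used in a model URI, e.g. models:/estate_model_rf@champion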
# You can also register the model right when the run is created
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name='register_at_run', experiment_id=experiment_id) as run:
    # get the unique id of this run
    run_id = run.info.run_id

    mlflow.sklearn.log_model(pipeline,
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file,
                             registered_model_name=REGISTRY_MODEL_NAME)  # the name to register the model under
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(pipeline.get_params())

run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'
# Registered models can be searched
model_reg = mlflow.search_registered_models()
model_reg[0]

model_name = REGISTRY_MODEL_NAME
model_version = 1

model_loaded = mlflow.sklearn.load_model(model_uri=f"models:/{model_name}/{model_version}")

model_loaded.predict(X_test.iloc[0:1])
y_test.iloc[0]
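The same registered version can also be loaded through the flavor-agnostic pyfunc interface (the one serving tools use); a minimal sketch:

import mlflow.pyfunc

model_pyfunc = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{model_version}")
model_pyfunc.predict(X_test.iloc[0:1])  # same prediction as via the sklearn flavor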
Feature engineering
Sklearn
from sklearn.preprocessing import QuantileTransformer, SplineTransformer, PolynomialFeatures, MinMaxScaler

X_train_sklearn = X_train.copy()

PolynomialFeatures
Builds polynomial features of the given degree from the specified features
pf = PolynomialFeatures(degree=2)

X_train_sklearn
pf.fit_transform(X_train_sklearn[['area', 'kitchen_area']])
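To see exactly which columns degree=2 produces, here is a toy check on two features (the values are illustrative):

demo = pd.DataFrame({'area': [40.0, 60.0], 'kitchen_area': [8.0, 12.0]})
pf_demo = PolynomialFeatures(degree=2)
pf_demo.fit_transform(demo)
pf_demo.get_feature_names_out()
# -> ['1', 'area', 'kitchen_area', 'area^2', 'area kitchen_area', 'kitchen_area^2']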
SplineTransformer
Builds a new feature matrix consisting of splines of order degree. The number of generated splines per feature is n_splines = n_knots + degree - 1, where
n_knots sets the number of knots (the points where the splines join) for each feature, and
degree sets the order of the polynomial used to build the splines.
For example, n_knots=3 and degree=3 yield 3 + 3 - 1 = 5 spline features per input feature.
sp = SplineTransformer(n_knots=3, degree=3)
sp.fit_transform(X_train_sklearn[['area']])
QuantileTransformer
This method transforms features so that they follow a uniform or a normal distribution, which makes the data less sensitive to outliers. The transformation is applied to each feature independently. The idea is to estimate the distribution function of a feature and use it to map the original values onto a uniform or normal distribution (output_distribution='uniform' or output_distribution='normal', respectively).
Example: for income data with a wide range of values, the quantile transformation makes the values more comparable and robust to outliers.
qt = QuantileTransformer()
qt.fit_transform(X_train_sklearn[['area']])
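To map onto a normal rather than the default uniform distribution, pass output_distribution explicitly; a minimal sketch:

qt_normal = QuantileTransformer(output_distribution='normal')
qt_normal.fit_transform(X_train_sklearn[['area']])  # values now follow a standard-normal shape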
Combining everything in a ColumnTransformer and building a Pipeline
pf = PolynomialFeatures(degree=2)
qt = QuantileTransformer()
sp = SplineTransformer(n_knots=3, degree=3)

# The transformed feature values still need scaling, so we build a two-step pipeline: transformation, then scaling
pf_pipeline = Pipeline(steps=[
    ('poly', pf),
    ('scale', StandardScaler())
])

preprocessor_sklearn = ColumnTransformer(
    transformers=[
        ('num', s_scaler, num_features),   # transformations for numeric features
        ('cat', l_encoder, cat_features),  # transformations for categorical features
        ('quantile', qt, num_features),
        ('poly', pf_pipeline, ['area', 'kitchen_area']),  # plug in the pipeline created above
        ('spline', sp, ['area'])
    ],
    remainder='drop')  # drop the columns not touched by the transformations
Let's look at what the dataframe now contains
## The polynomial transform does not fit into float64, so use it with care!
X_train_sklearn[['area', 'kitchen_area']] = X_train_sklearn[['area', 'kitchen_area']].astype('float128')

X_train_sklearn_raw = preprocessor_sklearn.fit_transform(X_train_sklearn)
X_train_sklearn = pd.DataFrame(X_train_sklearn_raw, columns=preprocessor_sklearn.get_feature_names_out())

# Handy for displaying all rows/columns of a DataFrame
with pd.option_context('display.max_rows', 5, 'display.max_columns', None):
    display(X_train_sklearn)
| | num__geo_lat | num__geo_lon | num__level | num__levels | num__rooms | num__area | num__kitchen_area | cat__region | cat__building_type | cat__object_type | quantile__geo_lat | quantile__geo_lon | quantile__level | quantile__levels | quantile__rooms | quantile__area | quantile__kitchen_area | poly__1 | poly__area | poly__kitchen_area | poly__area^2 | poly__area kitchen_area | poly__kitchen_area^2 | spline__area_sp_0 | spline__area_sp_1 | spline__area_sp_2 | spline__area_sp_3 | spline__area_sp_4 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.495902 | -0.449742 | 0.359235 | -0.214789 | 0.253413 | 0.063735 | -0.186285 | 20.0 | 1.0 | 0.0 | 0.766257 | 0.511028 | 0.717217 | 0.536537 | 0.600601 | 0.623624 | 0.374875 | 0.0 | 0.063735 | -0.186285 | -0.010002 | -0.132188 | -0.002792 | 0.155806 | 0.666179 | 0.178013 | 0.000002 | 0.0 |
| 1 | 0.177806 | 1.433673 | -0.246529 | -0.367718 | 0.253413 | -0.114293 | -0.186285 | 70.0 | 1.0 | 0.0 | 0.297142 | 0.867999 | 0.522022 | 0.386887 | 0.600601 | 0.541542 | 0.374875 | 0.0 | -0.114293 | -0.186285 | -0.017375 | -0.169370 | -0.002792 | 0.156921 | 0.666275 | 0.176803 | 0.000001 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 410773 | -0.748366 | -0.804077 | -0.650371 | 0.702788 | 0.253413 | 1.365441 | 1.501833 | 52.0 | 3.0 | 0.0 | 0.193143 | 0.114753 | 0.309810 | 0.741742 | 0.600601 | 0.961367 | 0.984535 | 0.0 | 1.365441 | 1.501833 | 0.068438 | 1.570163 | 0.008616 | 0.147820 | 0.665159 | 0.187011 | 0.000010 | 0.0 |
| 410774 | 1.257769 | -1.101815 | -0.044608 | 0.091070 | 1.175911 | 0.553789 | -0.142544 | 14.0 | 1.0 | 0.0 | 0.908036 | 0.075725 | 0.604605 | 0.645646 | 0.867367 | 0.841842 | 0.436436 | 0.0 | 0.553789 | -0.142544 | 0.014463 | -0.002742 | -0.002649 | 0.152767 | 0.665860 | 0.181370 | 0.000004 | 0.0 |

410775 rows × 28 columns
Building a pipeline with the preprocessing and the model
pipeline_sklearn = Pipeline(steps=[
    ('transform', preprocessor_sklearn),
    ('model', regressor)
])

model_sklearn = pipeline_sklearn.fit(X_train, y_train)
model_sklearn

predictions = model_sklearn.predict(X_test)
metrics = {}
metrics["mae"] = mean_absolute_error(y_test, predictions)
metrics["mape"] = mean_absolute_percentage_error(y_test, predictions)
metrics["mse"] = mean_squared_error(y_test, predictions)

metrics
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
RUN_NAME = 'fe_sklearn'

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # get the unique id of this run
    run_id = run.info.run_id

    mlflow.sklearn.log_model(model_sklearn,
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file)
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(model_sklearn.get_params())

run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'
Autofeat
from autofeat import AutoFeatRegressor

transformations = ["1/", "exp", "log", "abs", "sqrt", "^2", "^3", "1+", "1-", "sin", "cos", "exp-", "2^"]

afreg = AutoFeatRegressor(verbose=1, feateng_steps=2, max_gb=8, transformations=["log", "sqrt"], feateng_cols=num_features)
X_train_arf = afreg.fit_transform(X_train, y_train)
X_train_arf
# Build a wrapper that adds a get_feature_names_out() method for retrieving the feature names
import numpy as np

class AutoFeatWrapper():
    def __init__(self, feateng_cols, feateng_steps=1, max_gb=16, transformations=["1/", "exp", "log"], n_jobs=-1, verbose=1):
        self.feateng_cols = feateng_cols
        self.feateng_steps = feateng_steps
        self.max_gb = max_gb
        self.transformations = transformations
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.afreg = AutoFeatRegressor(feateng_cols=self.feateng_cols,
                                       feateng_steps=self.feateng_steps,
                                       max_gb=self.max_gb,
                                       transformations=self.transformations,
                                       n_jobs=self.n_jobs,
                                       verbose=self.verbose)

    def fit(self, X, y=None):
        self.afreg.fit(X, y)
        return self

    def transform(self, X):
        return self.afreg.transform(X)

    def get_feature_names_out(self, input_features=None):
        # Transform dummy data and return the feature names of the resulting DataFrame
        transformed_X = self.afreg.transform(pd.DataFrame(np.zeros((1, len(self.feateng_cols))), columns=self.feateng_cols))
        return transformed_X.columns.tolist()
afreg_pipeline = Pipeline(steps=[
    ('autofeat', AutoFeatWrapper(feateng_steps=2, max_gb=16, transformations=["log", "sqrt"], feateng_cols=num_features)),
    ('scaler', StandardScaler()),
])

preprocessor_afr = ColumnTransformer(
    transformers=[
        ('num', s_scaler, num_features),   # transformations for numeric features
        ('cat', l_encoder, cat_features),  # transformations for categorical features
        ('afr', afreg_pipeline, num_features),  # autofeat transformations
    ],
    remainder='drop')  # drop the columns not touched by the transformations

X_train_afr_raw = preprocessor_afr.fit_transform(X_train, y_train)
X_train_afr = pd.DataFrame(X_train_afr_raw, columns=preprocessor_afr.get_feature_names_out())

with pd.option_context('display.max_rows', 5, 'display.max_columns', None):
    display(X_train_afr)
| | num__geo_lat | num__geo_lon | num__level | num__levels | num__rooms | num__area | num__kitchen_area | cat__region | cat__building_type | cat__object_type | afr__geo_lat | afr__geo_lon | afr__level | afr__levels | afr__rooms | afr__area | afr__kitchen_area | afr__area*rooms | afr__area*geo_lon | afr__levels*rooms | afr__area*kitchen_area | afr__sqrt(area)*geo_lat | afr__sqrt(area)*log(level) | afr__kitchen_area*log(level) | afr__sqrt(area)*kitchen_area | afr__geo_lon*log(kitchen_area) | afr__sqrt(area)*sqrt(kitchen_area) | afr__sqrt(geo_lon)*sqrt(kitchen_area) | afr__log(area) | afr__rooms*log(level) | afr__kitchen_area*rooms | afr__kitchen_area*levels | afr__sqrt(geo_lon)*sqrt(level) | afr__area**(3/2) | afr__geo_lat*log(kitchen_area) | afr__geo_lat*log(geo_lon) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.495902 | -0.449742 | 0.359235 | -0.214789 | 0.253413 | 0.063735 | -0.186285 | 20.0 | 1.0 | 0.0 | 0.495902 | -0.449742 | 0.359235 | -0.214789 | 0.253413 | 0.063735 | -0.186285 | 0.006208 | -0.195129 | 0.060916 | -0.132188 | 0.373151 | 0.688076 | 0.044178 | -0.211335 | -0.481294 | -0.153548 | -0.490805 | 0.307835 | 0.690329 | -0.132529 | -0.352834 | 0.323880 | -0.008748 | -0.031529 | 0.068167 |
| 1 | 0.177806 | 1.433673 | -0.246529 | -0.367718 | 0.253413 | -0.114293 | -0.186285 | 70.0 | 1.0 | 0.0 | 0.177806 | 1.433673 | -0.246529 | -0.367718 | 0.253413 | -0.114293 | -0.186285 | -0.083402 | 0.655053 | -0.054279 | -0.169370 | 0.005114 | 0.071369 | -0.173647 | -0.252775 | 1.191304 | -0.267268 | 0.615798 | 0.031907 | 0.282625 | -0.132529 | -0.418643 | 0.552794 | -0.056540 | -0.143829 | 1.129118 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 410773 | -0.748366 | -0.804077 | -0.650371 | 0.702788 | 0.253413 | 1.365441 | 1.501833 | 52.0 | 3.0 | 0.0 | -0.748366 | -0.804077 | -0.650371 | 0.702788 | 0.253413 | 1.365441 | 1.501833 | 0.661427 | 0.375199 | 0.752088 | 1.570163 | 1.274445 | -0.002521 | 0.745507 | 2.382258 | 0.071599 | 2.828890 | 1.431272 | 1.729715 | -0.160491 | 1.581436 | 2.432437 | -0.843150 | 0.411475 | 1.671069 | -1.052343 |
| 410774 | 1.257769 | -1.101815 | -0.044608 | 0.091070 | 1.175911 | 0.553789 | -0.142544 | 14.0 | 1.0 | 0.0 | 1.257769 | -1.101815 | -0.044608 | 0.091070 | 1.175911 | 0.553789 | -0.142544 | 0.807887 | -0.330070 | 0.982478 | -0.002742 | 1.338996 | 0.635065 | -0.040302 | -0.055435 | -1.025588 | 0.202136 | -0.916054 | 0.940624 | 1.217910 | 0.311575 | -0.174762 | -0.415359 | 0.135617 | 0.359680 | -0.246790 |

410775 rows × 36 columns
pipeline_afr = Pipeline(steps=[('preprocessor', preprocessor_afr),
                               ('model', regressor)])

pipeline_afr.fit(X_train, y_train)
predictions = pipeline_afr.predict(X_test)

metrics = {}
metrics["mae"] = mean_absolute_error(y_test, predictions)
metrics["mape"] = mean_absolute_percentage_error(y_test, predictions)
metrics["mse"] = mean_squared_error(y_test, predictions)

metrics
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id

with mlflow.start_run(run_name='autofeat', experiment_id=experiment_id) as run:
    # get the unique id of this run
    run_id = run.info.run_id

    mlflow.sklearn.log_model(pipeline_afr,
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file)
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(pipeline_afr.get_params())

run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'
FEATURE SELECTION
RFE
Using the autofeat features
Since autofeat produces a different set of generated features each run, we can only add the selection of informative features as a pipeline step
from sklearn.feature_selection import RFE

X_train_afr
| | num__geo_lat | num__geo_lon | num__level | num__levels | num__rooms | num__area | num__kitchen_area | cat__region | cat__building_type | cat__object_type | ... | afr__sqrt(area)*sqrt(kitchen_area) | afr__sqrt(geo_lon)*sqrt(kitchen_area) | afr__log(area) | afr__rooms*log(level) | afr__kitchen_area*rooms | afr__kitchen_area*levels | afr__sqrt(geo_lon)*sqrt(level) | afr__area**(3/2) | afr__geo_lat*log(kitchen_area) | afr__geo_lat*log(geo_lon) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.495902 | -0.449742 | 0.359235 | -0.214789 | 0.253413 | 0.063735 | -0.186285 | 20.0 | 1.0 | 0.0 | ... | -0.153548 | -0.490805 | 0.307835 | 0.690329 | -0.132529 | -0.352834 | 0.323880 | -0.008748 | -0.031529 | 0.068167 |
| 1 | 0.177806 | 1.433673 | -0.246529 | -0.367718 | 0.253413 | -0.114293 | -0.186285 | 70.0 | 1.0 | 0.0 | ... | -0.267268 | 0.615798 | 0.031907 | 0.282625 | -0.132529 | -0.418643 | 0.552794 | -0.056540 | -0.143829 | 1.129118 |
| 2 | 0.440548 | 0.047222 | -0.448450 | -0.367718 | -0.669085 | -0.456947 | -0.142544 | 15.0 | 3.0 | 1.0 | ... | -0.454880 | -0.067183 | -0.603122 | -0.512211 | -0.487813 | -0.383803 | -0.243092 | -0.140800 | 0.063464 | 0.460495 |
| 3 | -1.588818 | -0.722477 | -0.246529 | -0.979436 | 0.253413 | -0.181292 | -0.142544 | 18.0 | 1.0 | 0.0 | ... | -0.254514 | -0.607607 | -0.080304 | 0.282625 | -0.088119 | -0.662523 | -0.369355 | -0.073838 | -0.672113 | -1.481033 |
| 4 | 1.493662 | 1.125819 | 0.157313 | 0.549858 | 0.253413 | 0.615045 | -0.011322 | 10.0 | 2.0 | 0.0 | ... | 0.438600 | 0.891383 | 1.009612 | 0.574497 | 0.045112 | 0.208478 | 0.945981 | 0.154902 | 0.780855 | 1.923382 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 410770 | 0.592011 | 0.355014 | 0.561156 | 1.008646 | 0.253413 | -0.079836 | -0.092653 | 54.0 | 2.0 | 0.0 | ... | -0.120035 | 0.237580 | 0.087725 | 0.792500 | -0.037463 | 0.322797 | 0.974381 | -0.047496 | 0.243018 | 0.789871 |
| 410771 | 0.240478 | 0.392697 | -0.650371 | -0.979436 | 0.253413 | -0.334434 | -0.404989 | 45.0 | 3.0 | 0.0 | ... | -0.716150 | -0.510766 | -0.357277 | -0.160491 | -0.354582 | -0.778657 | -0.406361 | -0.111897 | -0.808157 | 0.574534 |
| 410772 | -1.936771 | -0.688830 | 0.359235 | 0.855717 | -0.669085 | -0.456947 | -0.142544 | 18.0 | 0.0 | 1.0 | ... | -0.454880 | -0.581851 | -0.603122 | -0.211576 | -0.487813 | 0.173638 | 0.170166 | -0.140800 | -0.798234 | -1.663294 |
| 410773 | -0.748366 | -0.804077 | -0.650371 | 0.702788 | 0.253413 | 1.365441 | 1.501833 | 52.0 | 3.0 | 0.0 | ... | 2.828890 | 1.431272 | 1.729715 | -0.160491 | 1.581436 | 2.432437 | -0.843150 | 0.411475 | 1.671069 | -1.052343 |
| 410774 | 1.257769 | -1.101815 | -0.044608 | 0.091070 | 1.175911 | 0.553789 | -0.142544 | 14.0 | 1.0 | 0.0 | ... | 0.202136 | -0.916054 | 0.940624 | 1.217910 | 0.311575 | -0.174762 | -0.415359 | 0.135617 | 0.359680 | -0.246790 |

410775 rows × 36 columns
rfe_selector = RFE(estimator=regressor, n_features_to_select=12, step=0.2)  # drop 20% of features each iteration
X_train_rfe = rfe_selector.fit_transform(X_train_afr, y_train)

X_train_afr_rfe = pd.DataFrame(X_train_rfe, columns=rfe_selector.get_feature_names_out())
X_train_afr_rfe
| | num__geo_lat | num__geo_lon | afr__geo_lon | afr__area*kitchen_area | afr__sqrt(area)*geo_lat | afr__sqrt(area)*log(level) | afr__kitchen_area*log(level) | afr__sqrt(area)*sqrt(kitchen_area) | afr__rooms*log(level) | afr__kitchen_area*rooms | afr__sqrt(geo_lon)*sqrt(level) | afr__geo_lat*log(geo_lon) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.495902 | -0.449742 | -0.449742 | -0.132188 | 0.373151 | 0.688076 | 0.044178 | -0.153548 | 0.690329 | -0.132529 | 0.323880 | 0.068167 |
| 1 | 0.177806 | 1.433673 | 1.433673 | -0.169370 | 0.005114 | 0.071369 | -0.173647 | -0.267268 | 0.282625 | -0.132529 | 0.552794 | 1.129118 |
| 2 | 0.440548 | 0.047222 | 0.047222 | -0.226261 | -0.425530 | -0.335537 | -0.239271 | -0.454880 | -0.512211 | -0.487813 | -0.243092 | 0.460495 |
| 3 | -1.588818 | -0.722477 | -0.722477 | -0.165302 | -0.723225 | 0.034116 | -0.129771 | -0.254514 | 0.282625 | -0.088119 | -0.369355 | -1.481033 |
| 4 | 1.493662 | 1.125819 | 1.125819 | 0.094342 | 1.522265 | 0.862773 | 0.194490 | 0.438600 | 0.574497 | 0.045112 | 0.945981 | 1.923382 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 410770 | 0.592011 | 0.355014 | 0.355014 | -0.120841 | 0.206926 | 0.714499 | 0.226990 | -0.120035 | 0.792500 | -0.037463 | 0.974381 | 0.789871 |
| 410771 | 0.240478 | 0.392697 | 0.392697 | -0.296252 | -0.297209 | -0.551021 | -0.560144 | -0.716150 | -0.160491 | -0.354582 | -0.406361 | 0.574534 |
| 410772 | -1.936771 | -0.688830 | -0.688830 | -0.226261 | -1.192706 | 0.306280 | 0.100868 | -0.454880 | -0.211576 | -0.487813 | 0.170166 | -1.663294 |
| 410773 | -0.748366 | -0.804077 | -0.804077 | 1.570163 | 1.274445 | -0.002521 | 0.745507 | 2.828890 | -0.160491 | 1.581436 | -0.843150 | -1.052343 |
| 410774 | 1.257769 | -1.101815 | -1.101815 | -0.002742 | 1.338996 | 0.635065 | -0.040302 | 0.202136 | 1.217910 | 0.311575 | -0.415359 | -0.246790 |

410775 rows × 12 columns
rfe_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_afr),
    ('rfe_extractor', RFE(estimator=regressor, n_features_to_select=12, step=0.2)),
    ('model', regressor)
])

rfe_pipeline.fit(X_train, y_train)
predictions_rfe = rfe_pipeline.predict(X_test)

metrics = {}
metrics["mae"] = mean_absolute_error(y_test, predictions_rfe)
metrics["mape"] = mean_absolute_percentage_error(y_test, predictions_rfe)
metrics["mse"] = mean_squared_error(y_test, predictions_rfe)

metrics
{'mae': 1431925.3203264712,
'mape': 1.239752923791043e+18,
'mse': 261947924998018.2}
experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
RUN_NAME = 'rfe_feature_selection'

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # get the unique id of this run
    run_id = run.info.run_id

    mlflow.sklearn.log_model(rfe_pipeline,
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file)
    mlflow.log_metrics(metrics)
    mlflow.log_artifact(art)
    mlflow.log_params(rfe_pipeline.get_params())  # log the params of the pipeline we actually trained

run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 40.15it/s]
2024/10/17 14:26:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run rfe_feature_selection at: http://127.0.0.1:5000/#/experiments/1/runs/96f0bbcd6d88466abcf38f3b53f06ff1.
2024/10/17 14:26:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.
Using the sklearn features
Here we can select the features once during training, and then use the ColumnExtractor class written below as a pipeline step that picks the needed columns
rfe_skl_selector = RFE(estimator=regressor, n_features_to_select=12, step=0.2)  # drop 20% of features each iteration
X_train_skl_rfe = rfe_skl_selector.fit_transform(X_train_sklearn, y_train)

X_train_skl_rfe = pd.DataFrame(X_train_skl_rfe, columns=rfe_skl_selector.get_feature_names_out())
X_train_skl_rfe
| | num__geo_lat | num__geo_lon | num__level | num__rooms | num__kitchen_area | cat__region | quantile__geo_lat | quantile__geo_lon | quantile__level | poly__area kitchen_area | spline__area_sp_0 | spline__area_sp_2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.495902 | -0.449742 | 0.359235 | 0.253413 | -0.186285 | 20.0 | 0.766257 | 0.511028 | 0.717217 | -0.132188 | 0.155806 | 0.178013 |
| 1 | 0.177806 | 1.433673 | -0.246529 | 0.253413 | -0.186285 | 70.0 | 0.297142 | 0.867999 | 0.522022 | -0.169370 | 0.156921 | 0.176803 |
| 2 | 0.440548 | 0.047222 | -0.448450 | -0.669085 | -0.142544 | 15.0 | 0.732330 | 0.629984 | 0.417417 | -0.226261 | 0.159080 | 0.174488 |
| 3 | -1.588818 | -0.722477 | -0.246529 | 0.253413 | -0.142544 | 18.0 | 0.148789 | 0.295262 | 0.522022 | -0.165302 | 0.157341 | 0.176349 |
| 4 | 1.493662 | 1.125819 | 0.157313 | 0.253413 | -0.011322 | 10.0 | 0.985937 | 0.758363 | 0.662663 | 0.094342 | 0.152390 | 0.181792 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 410770 | 0.592011 | 0.355014 | 0.561156 | 0.253413 | -0.092653 | 54.0 | 0.788393 | 0.686728 | 0.771271 | -0.120841 | 0.156705 | 0.177037 |
| 410771 | 0.240478 | 0.392697 | -0.650371 | 0.253413 | -0.404989 | 45.0 | 0.494062 | 0.717240 | 0.309810 | -0.296252 | 0.158306 | 0.175314 |
| 410772 | -1.936771 | -0.688830 | 0.359235 | -0.669085 | -0.142544 | 18.0 | 0.131352 | 0.327613 | 0.717217 | -0.226261 | 0.159080 | 0.174488 |
| 410773 | -0.748366 | -0.804077 | -0.650371 | 0.253413 | 1.501833 | 52.0 | 0.193143 | 0.114753 | 0.309810 | 1.570163 | 0.147820 | 0.187011 |
| 410774 | 1.257769 | -1.101815 | -0.044608 | 1.175911 | -0.142544 | 14.0 | 0.908036 | 0.075725 | 0.604605 | -0.002742 | 0.152767 | 0.181370 |

410775 rows × 12 columns
rfe_cols = X_train_skl_rfe.columns.tolist()
rfe_cols
['num__geo_lat',
'num__geo_lon',
'num__level',
'num__rooms',
'num__kitchen_area',
'cat__region',
'quantile__geo_lat',
'quantile__geo_lon',
'quantile__level',
'poly__area kitchen_area',
'spline__area_sp_0',
'spline__area_sp_2']
rfe_idx = rfe_skl_selector.support_
rfe_idx
array([ True, True, True, False, True, False, True, True, False,
False, True, True, True, False, False, False, False, False,
False, False, False, True, False, True, False, True, False,
False])
# The selected columns must be logged, otherwise we lose the information about which features were chosen
with open('rfe_skl_idx.txt', 'w+') as f:
    f.write(str(rfe_idx))

with open('rfe_skl_cols.txt', 'w+') as f:
    f.write(str(rfe_cols))
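Alternatively, the selection can be logged as a structured artifact with mlflow.log_dict; a minimal sketch (the run name is our choice):

with mlflow.start_run(run_name='rfe_skl_selection', experiment_id=experiment_id) as run:
    mlflow.log_dict({'rfe_cols': rfe_cols, 'rfe_idx': rfe_idx.tolist()}, 'rfe_skl_selection.json')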
class ColumnExtractor(object):
    def __init__(self, cols):
        self.cols = cols

    def transform(self, X):
        return X[:, self.cols]

    def fit(self, X, y=None):
        return self

rfe_skl_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_sklearn),
    ('rfe_extractor', ColumnExtractor(rfe_idx)),
    ('model', regressor)
])

rfe_skl_pipeline.fit(X_train, y_train)
predictions_rfe_skl = rfe_skl_pipeline.predict(X_test)

metrics = {}
metrics["mae"] = mean_absolute_error(y_test, predictions_rfe_skl)
metrics["mape"] = mean_absolute_percentage_error(y_test, predictions_rfe_skl)
metrics["mse"] = mean_squared_error(y_test, predictions_rfe_skl)

metrics

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
RUN_NAME = 'rfe_skl_feature_selection'

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # get the unique id of this run
    run_id = run.info.run_id

    mlflow.sklearn.log_model(rfe_skl_pipeline,  # log the sklearn-features pipeline, not the autofeat one
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file)
    mlflow.log_metrics(metrics)
    mlflow.log_artifact('rfe_skl_cols.txt')
    mlflow.log_artifact('rfe_skl_idx.txt')
    mlflow.log_params(rfe_skl_pipeline.get_params())

run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'
Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 193.34it/s]
2024/10/17 14:32:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run rfe_skl_feature_selection at: http://127.0.0.1:5000/#/experiments/1/runs/e55206caeb1549e4aa0d98343d5c1d4d.
2024/10/17 14:32:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.
mlxtend
from mlxtend.feature_selection import SequentialFeatureSelector
#from sklearn.feature_selection import SequentialFeatureSelector

sfs = SequentialFeatureSelector(RandomForestRegressor(n_estimators=3),
                                k_features=3,
                                forward=True,
                                floating=False,  # True allows dropping already-selected features
                                scoring='neg_mean_absolute_error',
                                cv=2)

sfs.fit(X_train_sklearn, y_train)

selected_features_sfs = X_train_sklearn.loc[:, sfs.k_feature_names_]
selected_features_sfs
| | num__geo_lon | quantile__geo_lat | spline__area_sp_3 |
|---|---|---|---|
| 0 | -0.449742 | 0.766257 | 1.826008e-06 |
| 1 | 1.433673 | 0.297142 | 1.310449e-06 |
| 2 | 0.047222 | 0.732330 | 6.098363e-07 |
| 3 | -0.722477 | 0.148789 | 1.144942e-06 |
| 4 | 1.125819 | 0.985937 | 4.240047e-06 |
| ... | ... | ... | ... |
| 410770 | 0.355014 | 0.788393 | 1.401454e-06 |
| 410771 | 0.392697 | 0.494062 | 8.202272e-07 |
| 410772 | -0.688830 | 0.131352 | 6.098363e-07 |
| 410773 | -0.804077 | 0.193143 | 1.004843e-05 |
| 410774 | -1.101815 | 0.908036 | 3.903343e-06 |

410775 rows × 3 columns
rfe_sfs_idx = list(sfs.k_feature_idx_)
rfe_sfs_idx

rfe_sfs_col = list(sfs.k_feature_names_)
rfe_sfs_col
['num__geo_lon', 'quantile__geo_lat', 'spline__area_sp_3']
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
fig = plot_sfs(sfs.get_metric_dict(), kind='std_dev')

plt.title('Sequential Forward Selection (w. StdDev)')
plt.grid()
plt.show()
rfe_sfs_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_sklearn),
    ('rfe_extractor', ColumnExtractor(rfe_sfs_idx)),
    ('model', regressor)
])

rfe_sfs_pipeline.fit(X_train, y_train)
predictions_sfs = rfe_sfs_pipeline.predict(X_test)

metrics = {}
metrics["mae"] = mean_absolute_error(y_test, predictions_sfs)
metrics["mape"] = mean_absolute_percentage_error(y_test, predictions_sfs)
metrics["mse"] = mean_squared_error(y_test, predictions_sfs)

metrics

experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id
RUN_NAME = 'rfe_sfs_feature_selection'

with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:
    # get the unique id of this run
    run_id = run.info.run_id

    mlflow.sklearn.log_model(rfe_sfs_pipeline,
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file)
    mlflow.log_metrics(metrics)
    mlflow.log_artifact('rfe_skl_cols.txt')
    mlflow.log_artifact('rfe_skl_idx.txt')
    mlflow.log_params(rfe_sfs_pipeline.get_params())

run = mlflow.get_run(run_id)
assert run.info.status == 'FINISHED'
You can combine the features selected by SFS and SBS: take their union or their intersection. You can also combine them with features picked by other approaches - a whole field for experimentation; see the sketch below.
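For example, with plain Python sets over the selected column names; a minimal sketch (the backward selector sbs is our addition, it was not fitted above):

# Hypothetical backward selector: start from all features and drop them one by one
sbs = SequentialFeatureSelector(RandomForestRegressor(n_estimators=3),
                                k_features=3,
                                forward=False,
                                floating=False,
                                scoring='neg_mean_absolute_error',
                                cv=2)
sbs.fit(X_train_sklearn, y_train)

sfs_cols = set(sfs.k_feature_names_)
sbs_cols = set(sbs.k_feature_names_)

union_cols = sorted(sfs_cols | sbs_cols)         # keep everything either method selected
intersection_cols = sorted(sfs_cols & sbs_cols)  # keep only the features both methods agree on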
HYPERPARAMS
Gridsearch
from sklearn.model_selection import GridSearchCV

param_grid = {
    'model__depth': [1, 3, 5]
}

gs = GridSearchCV(rfe_sfs_pipeline, param_grid, cv=2, scoring='neg_mean_absolute_error')

gs.fit(X_train, y_train)
print("Best hyperparameters:", gs.best_params_)
gs_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_sklearn),
    ('rfe_extractor', ColumnExtractor(rfe_sfs_idx)),
    ('model', CatBoostRegressor(depth=5))
])

# Run the standard check on the test set and log the run (sketched below)
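A sketch of that standard check, mirroring the earlier cells (the run name 'gridsearch_best' is our choice):

gs_pipeline.fit(X_train, y_train)
predictions = gs_pipeline.predict(X_test)

metrics = {}
metrics["mae"] = mean_absolute_error(y_test, predictions)
metrics["mape"] = mean_absolute_percentage_error(y_test, predictions)
metrics["mse"] = mean_squared_error(y_test, predictions)

with mlflow.start_run(run_name='gridsearch_best', experiment_id=experiment_id) as run:
    mlflow.log_metrics(metrics)
    mlflow.log_params(gs.best_params_)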
Instead of GridSearch you can use RandomizedSearchCV
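A minimal sketch with scikit-learn's RandomizedSearchCV (the candidate values and n_iter are our choice):

from sklearn.model_selection import RandomizedSearchCV

param_distributions = {'model__depth': [1, 2, 3, 4, 5, 6]}
rs = RandomizedSearchCV(rfe_sfs_pipeline, param_distributions,
                        n_iter=3, cv=2, scoring='neg_mean_absolute_error')
rs.fit(X_train, y_train)
print("Best hyperparameters:", rs.best_params_)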
Optuna
import optuna

def objective(trial):
    # suggest hyperparameters
    depth = trial.suggest_int('depth', 1, 10)
    learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1)

    # build and train the model
    opt_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor_sklearn),
        ('rfe_extractor', ColumnExtractor(rfe_sfs_idx)),
        ('model', CatBoostRegressor(depth=depth, learning_rate=learning_rate, verbose=0))
    ])
    opt_pipeline.fit(X_train, y_train)

    # predict and compute MAE
    preds = opt_pipeline.predict(X_test)
    mae = mean_absolute_error(y_test, preds)

    return mae

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)

# print the results
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
opt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_sklearn),
    ('rfe_extractor', ColumnExtractor(rfe_sfs_idx)),
    ('model', CatBoostRegressor(depth=3, learning_rate=0.02789))
])

# Run the standard check on the test set and log the run
We pick the best model and train it on the full sample (not only the train part). That exact model is what we will deploy, as sketched below.
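A sketch of that final step (the run name 'final_model' is our choice; the hyperparameters follow the Optuna result above):

# Refit the winning pipeline on the full sample and register it for deployment
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor_sklearn),
    ('rfe_extractor', ColumnExtractor(rfe_sfs_idx)),
    ('model', CatBoostRegressor(depth=3, learning_rate=0.02789, verbose=0))
])
final_pipeline.fit(pd.concat([X_train, X_test]), pd.concat([y_train, y_test]))

with mlflow.start_run(run_name='final_model', experiment_id=experiment_id) as run:
    mlflow.sklearn.log_model(final_pipeline,
                             artifact_path="models",
                             signature=signature,
                             input_example=input_example,
                             pip_requirements=req_file,
                             registered_model_name=REGISTRY_MODEL_NAME)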