Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.
iis-project/research/research.ipynb

670 KiB

Исследование и настройка предсказательной модели для цен подержанных автомобилях

Блокнот использует файл аугментированных данных датасета о подержанных автомобилях, создаваемый блокнотом eda/cars_eda.py. См. ниже параметры блокнота для papermill.

#XXX: разделить блокнот штук на 5
from typing import Optional
data_aug_pickle_path: Optional[str] = None
# Полный путь к файлу (pickle) для сохранения очищенного датасета. Если не установлен, используется `data/<data_aug_pickle_relpath>`.
data_aug_pickle_relpath: str = 'cars.aug.pickle'
# Путь к файлу (pickle) для сохранения очищенного датасета относительно директории данных `data`. Игнорируется, если установлен data_aug_pickle_path.

#model_global_comment_path: Optional[str] = None
## Полный путь к текстовому файлу с произвольным комментарием для сохранения в MLFlow как артефакт вместе с моделью. Если не установлен, используется `research/<comment_relpath>`.
#model_comment_relpath: str = 'comment.txt'
## Путь к текстовому файлу с произвольным комментарием для сохранения в MLFlow как артефакт вместе с моделью относительно директории `research`. Игнорируется, если установлен comment_path.

mlflow_tracking_server_uri: str = 'http://localhost:5000'
# URL tracking-сервера MLFlow.
mlflow_registry_uri: Optional[str] = None
# URL сервера registry MLFlow (если не указан, используется `mlflow_tracking_server_uri`).

mlflow_do_log: bool = False
# Записывать ли прогоны (runs) в MLFlow.
mlflow_experiment_id: Optional[str] = None
# ID эксперимента MLFlow, имеет приоритет над `mlflow_experiment_name`.
mlflow_experiment_name: Optional[str] = 'Current price predicion for used cars'
# Имя эксперимента MLFlow (ниже приоритетом, чем `mlflow_experiment_id`).
mlflow_root_run_name: str = 'Models'
# Имя корневого прогона MLFlow (остальные прогоны будут созданы блокнотом внутри этого, как nested)
from collections.abc import Collection, Sequence
import os
import pathlib
import pickle
import sys
import matplotlib
import mlflow
import mlflow.models
import mlflow.sklearn
import mlxtend.feature_selection
import mlxtend.plotting
import optuna
import optuna.samplers
import sklearn.compose
import sklearn.ensemble
import sklearn.metrics
import sklearn.model_selection
import sklearn.pipeline
import sklearn.preprocessing
BASE_PATH = pathlib.Path('..')
CODE_PATH = BASE_PATH
sys.path.insert(0, str(CODE_PATH.resolve()))
from iis_project.mlxtend_utils.feature_selection import SEQUENTIAL_FEATURE_SELECTOR_PARAMS_COMMON_INCLUDE
from iis_project.sklearn_utils import filter_params
from iis_project.sklearn_utils.compose import COLUMN_TRANSFORMER_PARAMS_COMMON_INCLUDE
from iis_project.sklearn_utils.ensemble import RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE
from iis_project.sklearn_utils.pandas import pandas_dataframe_from_transformed_artifacts
from iis_project.sklearn_utils.preprocessing import STANDARD_SCALER_PARAMS_COMMON_EXCLUDE
MODEL_INOUT_EXAMPLE_SIZE = 0x10
mlflow.set_tracking_uri(mlflow_tracking_server_uri)
if mlflow_registry_uri is not None:
    mlflow.set_registry_uri(mlflow_registry_uri)
if mlflow_do_log:
    mlflow_experiment = mlflow.set_experiment(experiment_name=mlflow_experiment_name, experiment_id=mlflow_experiment_id)
    mlflow_root_run_id = None  # изменяется позже
2025/11/02 01:54:17 INFO mlflow.tracking.fluent: Experiment with name 'Current price predicion for used cars' does not exist. Creating a new experiment.
DATA_PATH = (
    pathlib.Path(os.path.dirname(data_aug_pickle_path))
    if data_aug_pickle_path is not None
    else (BASE_PATH / 'data')
)
def build_sequential_feature_selector(*args, **kwargs):
    return mlxtend.feature_selection.SequentialFeatureSelector(*args, **kwargs)

def plot_sequential_feature_selection(feature_selector, *args_rest, **kwargs):
    metric_dict = feature_selector.get_metric_dict()
    return mlxtend.plotting.plot_sequential_feature_selection(metric_dict, *args_rest, **kwargs)

Загрузка и обзор данных

with open(
    (
        data_aug_pickle_path
        if data_aug_pickle_path is not None
        else (DATA_PATH / data_aug_pickle_relpath)
    ),
    'rb',
) as input_file:
    df_orig = pickle.load(input_file)

Обзор датасета:

len(df_orig)
299
df_orig.info()
<class 'pandas.core.frame.DataFrame'>
Index: 299 entries, 0 to 300
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   car_name             299 non-null    object  
 1   year                 299 non-null    int64   
 2   selling_price        299 non-null    float64 
 3   present_price        299 non-null    float64 
 4   driven_kms           299 non-null    int64   
 5   fuel_type            299 non-null    category
 6   selling_type         299 non-null    category
 7   transmission         299 non-null    category
 8   owner                299 non-null    category
 9   age                  299 non-null    float64 
 10  present_price_ratio  299 non-null    float64 
 11  log_selling_price    299 non-null    float64 
 12  log_present_price    299 non-null    float64 
 13  log_driven_kms       299 non-null    float64 
 14  log_age              299 non-null    float64 
dtypes: category(4), float64(8), int64(2), object(1)
memory usage: 29.3+ KB
df_orig.head(0x10)
car_name year selling_price present_price driven_kms fuel_type selling_type transmission owner age present_price_ratio log_selling_price log_present_price log_driven_kms log_age
0 ritz 2014 5.59 3.35 27000 petrol dealer manual 0 5.0 0.599284 0.747412 0.525045 4.431364 0.698970
1 sx4 2013 9.54 4.75 43000 diesel dealer manual 0 6.0 0.497904 0.979548 0.676694 4.633468 0.778151
2 ciaz 2017 9.85 7.25 6900 petrol dealer manual 0 2.0 0.736041 0.993436 0.860338 3.838849 0.301030
3 wagon r 2011 4.15 2.85 5200 petrol dealer manual 0 8.0 0.686747 0.618048 0.454845 3.716003 0.903090
4 swift 2014 6.87 4.60 42450 diesel dealer manual 0 5.0 0.669578 0.836957 0.662758 4.627878 0.698970
5 vitara brezza 2018 9.83 9.25 2071 diesel dealer manual 0 1.0 0.940997 0.992554 0.966142 3.316180 0.000000
6 ciaz 2015 8.12 6.75 18796 petrol dealer manual 0 4.0 0.831281 0.909556 0.829304 4.274065 0.602060
7 s cross 2015 8.61 6.50 33429 diesel dealer manual 0 4.0 0.754936 0.935003 0.812913 4.524123 0.602060
8 ciaz 2016 8.89 8.75 20273 diesel dealer manual 0 3.0 0.984252 0.948902 0.942008 4.306918 0.477121
9 ciaz 2015 8.92 7.45 42367 diesel dealer manual 0 4.0 0.835202 0.950365 0.872156 4.627028 0.602060
10 alto 800 2017 3.60 2.85 2135 petrol dealer manual 0 2.0 0.791667 0.556303 0.454845 3.329398 0.301030
11 ciaz 2015 10.38 6.85 51000 diesel dealer manual 0 4.0 0.659923 1.016197 0.835691 4.707570 0.602060
12 ciaz 2015 9.94 7.50 15000 petrol dealer automatic 0 4.0 0.754527 0.997386 0.875061 4.176091 0.602060
13 ertiga 2015 7.71 6.10 26000 petrol dealer manual 0 4.0 0.791180 0.887054 0.785330 4.414973 0.602060
14 dzire 2009 7.21 2.25 77427 petrol dealer manual 0 10.0 0.312067 0.857935 0.352183 4.888892 1.000000
15 ertiga 2016 10.79 7.75 43000 diesel dealer manual 0 3.0 0.718258 1.033021 0.889302 4.633468 0.477121

Разделение датасета на выборки

Выделение признаков и целевых переменных:

feature_columns = (
    'selling_price',
    'driven_kms',
    'fuel_type',
    'selling_type',
    'transmission',
    #'owner',
    'age',
)

target_columns = (
    'present_price',
)
features_to_scale_to_standard_columns = (
    'selling_price',
    'driven_kms',
    'age',
)
assert all(
    (col in df_orig.select_dtypes(('number',)).columns)
    for col in features_to_scale_to_standard_columns
)

features_to_encode_wrt_target_columns = (
    'fuel_type',
    'selling_type',
    'transmission',
    #'owner',
)
assert all(
    (col in df_orig.select_dtypes(('category', 'object')).columns)
    for col in features_to_encode_wrt_target_columns
)
df_orig_features = df_orig[list(feature_columns)]
df_target = df_orig[list(target_columns)]

Разделение на обучающую и тестовую выборки:

DF_TEST_PORTION = 0.25
df_orig_features_train, df_orig_features_test, df_target_train, df_target_test = (
    sklearn.model_selection.train_test_split(
        df_orig_features, df_target, test_size=DF_TEST_PORTION, random_state=0x7AE6,
    )
)

Размеры обучающей и тестовой выборки соответственно:

tuple(map(len, (df_target_train, df_target_test)))
(224, 75)

Модели

# XXX: один файл requirements для всех моделей
MODEL_PIP_REQUIREMENTS_PATH = BASE_PATH / 'requirements' / 'requirements-isolated-research-model.txt'

Сигнатура модели для MLFlow:

mlflow_model_signature = mlflow.models.infer_signature(model_input=df_orig_features, model_output=df_target)
mlflow_model_signature
D:\studying\university\projects\sem_03_iis\mpei-iis-project\.venv\Lib\site-packages\mlflow\types\utils.py:452: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values <https://www.mlflow.org/docs/latest/models.html#handling-integers-with-missing-values>`_ for more details.
  warnings.warn(
inputs: 
  ['selling_price': double (required), 'driven_kms': long (required), 'fuel_type': string (required), 'selling_type': string (required), 'transmission': string (required), 'age': double (required)]
outputs: 
  ['present_price': double (required)]
params: 
  None
def build_features_scaler_standard():
    return sklearn.preprocessing.StandardScaler()
#def build_categorical_features_encoder_onehot():
#    return sklearn.preprocessing.OneHotEncoder()

def build_categorical_features_encoder_target(*, random_state=None):
    return sklearn.preprocessing.TargetEncoder(
        target_type='continuous', smooth='auto', shuffle=True, random_state=random_state,
    )

Регрессор — небольшой случайный лес, цель — минимизация квадрата ошибки предсказания:

def build_regressor(n_estimators, *, max_depth=None, max_features='sqrt', random_state=None):
    return sklearn.ensemble.RandomForestRegressor(
        n_estimators, criterion='squared_error',
        max_depth=max_depth, max_features=max_features,
        random_state=random_state,
    )

def build_regressor_baseline(*, random_state=None):
    return build_regressor(10, max_depth=8, max_features='sqrt')
def score_predictions(target_test, target_test_predicted):
    return {
        'mse': sklearn.metrics.mean_squared_error(target_test, target_test_predicted),
        'mae': sklearn.metrics.mean_absolute_error(target_test, target_test_predicted),
        'mape': sklearn.metrics.mean_absolute_percentage_error(target_test, target_test_predicted),
    }
# использует глобальные переменные mlflow_do_log, mlflow_experiment, mlflow_root_run_name
def mlflow_log_model(
    model,
    model_params,
    metrics,
    *,
    nested_run_name,
    model_signature=None,
    input_example=None,
    pip_requirements=None,
    #global_comment_file_path=None,
    extra_logs_handler=None,
):
    global mlflow_root_run_id
    if not mlflow_do_log:
        return
    experiment_id = mlflow_experiment.experiment_id
    start_run_root_kwargs_extra = {}
    if mlflow_root_run_id is not None:
        start_run_root_kwargs_extra['run_id'] = mlflow_root_run_id
    else:
        start_run_root_kwargs_extra['run_name'] = mlflow_root_run_name
    with mlflow.start_run(experiment_id=experiment_id, **start_run_root_kwargs_extra) as root_run:
        if root_run.info.status not in ('RUNNING',):
            raise RuntimeError('Cannot get the root run to run')
        if mlflow_root_run_id is None:
            mlflow_root_run_id = root_run.info.run_id
        # важно одновременно использовать nested=True и parent_run_id=...:
        with mlflow.start_run(experiment_id=experiment_id, run_name=nested_run_name, nested=True, parent_run_id=mlflow_root_run_id):
            if isinstance(pip_requirements, pathlib.PurePath):
                pip_requirements = str(pip_requirements)
            _ = mlflow.sklearn.log_model(
                model,
                'model',
                signature=model_signature,
                input_example=input_example,
                pip_requirements=pip_requirements,
            )
            if model_params is not None:
                _ = mlflow.log_params(model_params)
            if metrics is not None:
                _ = mlflow.log_metrics(metrics)
            #if (global_comment_file_path is not None) and global_comment_file_path.exists():
            #    mlflow.log_artifact(str(global_comment_file_path))
            if extra_logs_handler is not None:
                if callable(extra_logs_handler) and (not isinstance(extra_logs_handler, Collection)):
                    extra_logs_handler = (extra_logs_handler,)
                for extr_logs_handler_fn in extra_logs_handler:
                    extr_logs_handler_fn(mlflow)

Baseline модель

Пайплайн предобработки признаков:

preprocess_transformer = sklearn.compose.ColumnTransformer(
    [
        ('scale_to_standard', build_features_scaler_standard(), features_to_scale_to_standard_columns),
        (
            #'encode_categoricals_one_hot',
            'encode_categoricals_wrt_target',
            #build_categorical_features_encoder_onehot(),
            build_categorical_features_encoder_target(random_state=0x2ED6),
            features_to_encode_wrt_target_columns,
        ),
    ],
    remainder='drop',
)
regressor = build_regressor_baseline(random_state=0x016B)
regressor
RandomForestRegressor(max_depth=8, max_features='sqrt', n_estimators=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
n_estimators  10
criterion  'squared_error'
max_depth  8
min_samples_split  2
min_samples_leaf  1
min_weight_fraction_leaf  0.0
max_features  'sqrt'
max_leaf_nodes  None
min_impurity_decrease  0.0
bootstrap  True
oob_score  False
n_jobs  None
random_state  None
verbose  0
warm_start  False
ccp_alpha  0.0
max_samples  None
monotonic_cst  None

Составной пайплайн:

pipeline = sklearn.pipeline.Pipeline([
    ('preprocess', preprocess_transformer),
    ('regress', regressor),
])
pipeline
Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('scale_to_standard',
                                                  StandardScaler(),
                                                  ('selling_price',
                                                   'driven_kms', 'age')),
                                                 ('encode_categoricals_wrt_target',
                                                  TargetEncoder(random_state=11990,
                                                                target_type='continuous'),
                                                  ('fuel_type', 'selling_type',
                                                   'transmission'))])),
                ('regress',
                 RandomForestRegressor(max_depth=8, max_features='sqrt',
                                       n_estimators=10))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
steps  [('preprocess', ...), ('regress', ...)]
transform_input  None
memory  None
verbose  False
Parameters
transformers  [('scale_to_standard', ...), ('encode_categoricals_wrt_target', ...)]
remainder  'drop'
sparse_threshold  0.3
n_jobs  None
transformer_weights  None
verbose  False
verbose_feature_names_out  True
force_int_remainder_cols  'deprecated'
('selling_price', 'driven_kms', 'age')
Parameters
copy  True
with_mean  True
with_std  True
('fuel_type', 'selling_type', 'transmission')
Parameters
categories  'auto'
target_type  'continuous'
smooth  'auto'
cv  5
shuffle  True
random_state  11990
Parameters
n_estimators  10
criterion  'squared_error'
max_depth  8
min_samples_split  2
min_samples_leaf  1
min_weight_fraction_leaf  0.0
max_features  'sqrt'
max_leaf_nodes  None
min_impurity_decrease  0.0
bootstrap  True
oob_score  False
n_jobs  None
random_state  None
verbose  0
warm_start  False
ccp_alpha  0.0
max_samples  None
monotonic_cst  None
model_params = filter_params(
    pipeline.get_params(),
    include={
        'preprocess': (
            False,
            {
                **{k: True for k in COLUMN_TRANSFORMER_PARAMS_COMMON_INCLUDE},
                'scale_to_standard': True,
                'encode_categorical_wrt_target': True,
            },
        ),
        'regress': (False, True),
    },
    exclude={
        'preprocess': {'scale_to_standard': STANDARD_SCALER_PARAMS_COMMON_EXCLUDE},
        'regress': RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE,
    },
)
model_params
{'preprocess__remainder': 'drop',
 'preprocess__sparse_threshold': 0.3,
 'preprocess__transformer_weights': None,
 'preprocess__scale_to_standard__with_mean': True,
 'preprocess__scale_to_standard__with_std': True,
 'regress__bootstrap': True,
 'regress__ccp_alpha': 0.0,
 'regress__criterion': 'squared_error',
 'regress__max_depth': 8,
 'regress__max_features': 'sqrt',
 'regress__max_leaf_nodes': None,
 'regress__max_samples': None,
 'regress__min_impurity_decrease': 0.0,
 'regress__min_samples_leaf': 1,
 'regress__min_samples_split': 2,
 'regress__min_weight_fraction_leaf': 0.0,
 'regress__monotonic_cst': None,
 'regress__n_estimators': 10,
 'regress__oob_score': False,
 'regress__random_state': None}

Обучение модели:

_ = pipeline.fit(df_orig_features_train, df_target_train.iloc[:, 0])

Оценка качества:

target_test_predicted = pipeline.predict(df_orig_features_test)

Метрики качества (MAPE, а также MSE, MAE):

metrics = score_predictions(df_target_test, target_test_predicted)
metrics
{'mse': 1.1769122812432413,
 'mae': 0.7433282022345273,
 'mape': 0.3469466962984192}
mlflow_log_model(
    pipeline,
    model_params=model_params,
    metrics={k: float(v) for k, v in metrics.items()},
    nested_run_name='Baseline model',
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
    #global_comment_file_path=(
    #    model_comment_path
    #    if model_comment_path is not None
    #     else (BASE_PATH / 'research' / model_comment_relpath)
    #),
)
{"model_id":"9ebfedda037646158f6e4acd2cbab0e5","version_major":2,"version_minor":0}
🏃 View run Baseline model at: http://localhost:5000/#/experiments/1/runs/76affaba12a24ee68eb979ae373eb70a
🧪 View experiment at: http://localhost:5000/#/experiments/1
🏃 View run Models at: http://localhost:5000/#/experiments/1/runs/4e4a9094cb3c4eed9d4a056a27cadcd9
🧪 View experiment at: http://localhost:5000/#/experiments/1

Модель с дополнительными признаками

Пайплайн предобработки признаков:

features_to_extend_as_polynomial = ('selling_price', 'driven_kms')
features_to_extend_as_spline = ('age',)
def build_preprocess_augmenting_transformer():
    assert set(features_to_extend_as_polynomial) <= {*features_to_scale_to_standard_columns}
    assert set(features_to_extend_as_spline) <= {*features_to_scale_to_standard_columns}
    return sklearn.compose.ColumnTransformer(
        [
            (
                'extend_features_as_polynomial',
                sklearn.pipeline.Pipeline([
                    (
                        'extend_features',
                        sklearn.preprocessing.PolynomialFeatures(2, include_bias=False),
                    ),
                    ('scale_to_standard', build_features_scaler_standard()),
                ]),
                features_to_extend_as_polynomial,
            ),
            (
                'extend_features_as_spline',
                sklearn.preprocessing.SplineTransformer(
                    4, knots='quantile', extrapolation='constant', include_bias=False,
                ),
                features_to_extend_as_spline,
            ),
            (
                'scale_to_standard',
                build_features_scaler_standard(),
                tuple(filter(lambda f: f not in features_to_extend_as_polynomial, features_to_scale_to_standard_columns)),
            ),
            (
                'encode_categoricals_wrt_target',
                build_categorical_features_encoder_target(random_state=0x2ED6),
                features_to_encode_wrt_target_columns,
            ),
        ],
        remainder='drop',
    )
PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_INCLUDE = {
    **{k: True for k in COLUMN_TRANSFORMER_PARAMS_COMMON_INCLUDE},
    'extend_features_as_polynomial': {
        'extend_features': True,
        'scale_to_standard': True,
    },
    'extend_features_as_spline': True,
    'scale_to_standard': True,
    'encode_categorical_wrt_target': True,
}
PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_EXCLUDE = {
    'extend_features_as_polynomial': {
        'scale_to_standard': STANDARD_SCALER_PARAMS_COMMON_EXCLUDE,
    },
    'scale_to_standard': STANDARD_SCALER_PARAMS_COMMON_EXCLUDE,
}
preprocess_transformer = build_preprocess_augmenting_transformer()
preprocess_transformer
ColumnTransformer(transformers=[('extend_features_as_polynomial',
                                 Pipeline(steps=[('extend_features',
                                                  PolynomialFeatures(include_bias=False)),
                                                 ('scale_to_standard',
                                                  StandardScaler())]),
                                 ('selling_price', 'driven_kms')),
                                ('extend_features_as_spline',
                                 SplineTransformer(include_bias=False,
                                                   knots='quantile',
                                                   n_knots=4),
                                 ('age',)),
                                ('scale_to_standard', StandardScaler(),
                                 ('age',)),
                                ('encode_categoricals_wrt_target',
                                 TargetEncoder(random_state=11990,
                                               target_type='continuous'),
                                 ('fuel_type', 'selling_type',
                                  'transmission'))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
transformers  [('extend_features_as_polynomial', ...), ('extend_features_as_spline', ...), ...]
remainder  'drop'
sparse_threshold  0.3
n_jobs  None
transformer_weights  None
verbose  False
verbose_feature_names_out  True
force_int_remainder_cols  'deprecated'
('selling_price', 'driven_kms')
Parameters
degree  2
interaction_only  False
include_bias  False
order  'C'
Parameters
copy  True
with_mean  True
with_std  True
('age',)
Parameters
n_knots  4
degree  3
knots  'quantile'
extrapolation  'constant'
include_bias  False
order  'C'
sparse_output  False
('age',)
Parameters
copy  True
with_mean  True
with_std  True
('fuel_type', 'selling_type', 'transmission')
Parameters
categories  'auto'
target_type  'continuous'
smooth  'auto'
cv  5
shuffle  True
random_state  11990

Демонстрация предобработки данных:

preprocess_transformer_tmp = build_preprocess_augmenting_transformer()
df_augd_features_matrix_train = preprocess_transformer_tmp.fit_transform(df_orig_features_train, df_target_train.iloc[:, 0])
df_augd_features_train = pandas_dataframe_from_transformed_artifacts(df_augd_features_matrix_train, preprocess_transformer_tmp)
del preprocess_transformer_tmp

Обзор предобработанного датасета:

df_augd_features_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224 entries, 0 to 223
Data columns (total 14 columns):
 #   Column                                                   Non-Null Count  Dtype  
---  ------                                                   --------------  -----  
 0   extend_features_as_polynomial__selling_price             224 non-null    float64
 1   extend_features_as_polynomial__driven_kms                224 non-null    float64
 2   extend_features_as_polynomial__selling_price^2           224 non-null    float64
 3   extend_features_as_polynomial__selling_price driven_kms  224 non-null    float64
 4   extend_features_as_polynomial__driven_kms^2              224 non-null    float64
 5   extend_features_as_spline__age_sp_0                      224 non-null    float64
 6   extend_features_as_spline__age_sp_1                      224 non-null    float64
 7   extend_features_as_spline__age_sp_2                      224 non-null    float64
 8   extend_features_as_spline__age_sp_3                      224 non-null    float64
 9   extend_features_as_spline__age_sp_4                      224 non-null    float64
 10  scale_to_standard__age                                   224 non-null    float64
 11  encode_categoricals_wrt_target__fuel_type                224 non-null    float64
 12  encode_categoricals_wrt_target__selling_type             224 non-null    float64
 13  encode_categoricals_wrt_target__transmission             224 non-null    float64
dtypes: float64(14)
memory usage: 24.6 KB
df_augd_features_train.head(0x8)
extend_features_as_polynomial__selling_price extend_features_as_polynomial__driven_kms extend_features_as_polynomial__selling_price^2 extend_features_as_polynomial__selling_price driven_kms extend_features_as_polynomial__driven_kms^2 extend_features_as_spline__age_sp_0 extend_features_as_spline__age_sp_1 extend_features_as_spline__age_sp_2 extend_features_as_spline__age_sp_3 extend_features_as_spline__age_sp_4 scale_to_standard__age encode_categoricals_wrt_target__fuel_type encode_categoricals_wrt_target__selling_type encode_categoricals_wrt_target__transmission
0 -0.104244 -0.059337 -0.160142 -0.184156 -0.213392 0.000000 0.000000 0.284444 0.614343 0.099879 0.983159 3.418066 6.723044 4.251590
1 0.524405 -0.930984 0.023111 -0.341051 -0.467047 0.049383 0.528395 0.417778 0.004444 0.000000 -1.141223 9.374655 6.400821 3.750236
2 -0.364071 -0.699614 -0.204196 -0.411821 -0.427250 0.006173 0.303549 0.654722 0.035556 0.000000 -0.787159 3.313404 7.018116 4.015122
3 -0.686652 -0.942552 -0.233103 -0.493887 -0.468514 0.006173 0.303549 0.654722 0.035556 0.000000 -0.787159 3.532072 0.673151 4.202766
4 -0.291407 0.090899 -0.193742 -0.236248 -0.141138 0.000000 0.100000 0.780000 0.120000 0.000000 -0.433096 4.968111 7.161109 4.059384
5 -0.747205 -0.236874 -0.235345 -0.474524 -0.287960 0.000000 0.000000 0.190556 0.640202 0.164742 1.337222 3.080685 0.697119 3.750236
6 0.026771 1.112782 -0.130900 0.246412 0.572931 0.000000 0.000000 0.000000 0.227273 0.606061 3.815667 3.532072 6.675406 4.202766
7 -0.180210 -0.066162 -0.174939 -0.219328 -0.216475 0.000000 0.100000 0.780000 0.120000 0.000000 -0.433096 3.284326 7.161109 4.059384
regressor = build_regressor_baseline(random_state=0x3AEF)
regressor
RandomForestRegressor(max_depth=8, max_features='sqrt', n_estimators=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
n_estimators  10
criterion  'squared_error'
max_depth  8
min_samples_split  2
min_samples_leaf  1
min_weight_fraction_leaf  0.0
max_features  'sqrt'
max_leaf_nodes  None
min_impurity_decrease  0.0
bootstrap  True
oob_score  False
n_jobs  None
random_state  None
verbose  0
warm_start  False
ccp_alpha  0.0
max_samples  None
monotonic_cst  None

Составной пайплайн:

pipeline = sklearn.pipeline.Pipeline([
    ('preprocess', preprocess_transformer),
    ('regress', regressor),
])
pipeline
Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('extend_features_as_polynomial',
                                                  Pipeline(steps=[('extend_features',
                                                                   PolynomialFeatures(include_bias=False)),
                                                                  ('scale_to_standard',
                                                                   StandardScaler())]),
                                                  ('selling_price',
                                                   'driven_kms')),
                                                 ('extend_features_as_spline',
                                                  SplineTransformer(include_bias=False,
                                                                    knots='quantile',
                                                                    n_knots=4),
                                                  ('age',)),
                                                 ('scale_to_standard',
                                                  StandardScaler(), ('age',)),
                                                 ('encode_categoricals_wrt_target',
                                                  TargetEncoder(random_state=11990,
                                                                target_type='continuous'),
                                                  ('fuel_type', 'selling_type',
                                                   'transmission'))])),
                ('regress',
                 RandomForestRegressor(max_depth=8, max_features='sqrt',
                                       n_estimators=10))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
steps  [('preprocess', ...), ('regress', ...)]
transform_input  None
memory  None
verbose  False
Parameters
transformers  [('extend_features_as_polynomial', ...), ('extend_features_as_spline', ...), ...]
remainder  'drop'
sparse_threshold  0.3
n_jobs  None
transformer_weights  None
verbose  False
verbose_feature_names_out  True
force_int_remainder_cols  'deprecated'
('selling_price', 'driven_kms')
Parameters
degree  2
interaction_only  False
include_bias  False
order  'C'
Parameters
copy  True
with_mean  True
with_std  True
('age',)
Parameters
n_knots  4
degree  3
knots  'quantile'
extrapolation  'constant'
include_bias  False
order  'C'
sparse_output  False
('age',)
Parameters
copy  True
with_mean  True
with_std  True
('fuel_type', 'selling_type', 'transmission')
Parameters
categories  'auto'
target_type  'continuous'
smooth  'auto'
cv  5
shuffle  True
random_state  11990
Parameters
n_estimators  10
criterion  'squared_error'
max_depth  8
min_samples_split  2
min_samples_leaf  1
min_weight_fraction_leaf  0.0
max_features  'sqrt'
max_leaf_nodes  None
min_impurity_decrease  0.0
bootstrap  True
oob_score  False
n_jobs  None
random_state  None
verbose  0
warm_start  False
ccp_alpha  0.0
max_samples  None
monotonic_cst  None
model_params = filter_params(
    pipeline.get_params(),
    include={
        'preprocess': (False, PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_INCLUDE.copy()),
        'regress': (False, True),
    },
    exclude={
        'preprocess': PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_EXCLUDE.copy(),
        'regress': RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE,
    },
)
model_params
{'preprocess__remainder': 'drop',
 'preprocess__sparse_threshold': 0.3,
 'preprocess__transformer_weights': None,
 'preprocess__extend_features_as_spline': SplineTransformer(include_bias=False, knots='quantile', n_knots=4),
 'preprocess__extend_features_as_polynomial__extend_features': PolynomialFeatures(include_bias=False),
 'preprocess__extend_features_as_polynomial__extend_features__degree': 2,
 'preprocess__extend_features_as_polynomial__extend_features__include_bias': False,
 'preprocess__extend_features_as_polynomial__extend_features__interaction_only': False,
 'preprocess__extend_features_as_polynomial__extend_features__order': 'C',
 'preprocess__extend_features_as_polynomial__scale_to_standard__with_mean': True,
 'preprocess__extend_features_as_polynomial__scale_to_standard__with_std': True,
 'preprocess__extend_features_as_spline__degree': 3,
 'preprocess__extend_features_as_spline__extrapolation': 'constant',
 'preprocess__extend_features_as_spline__include_bias': False,
 'preprocess__extend_features_as_spline__knots': 'quantile',
 'preprocess__extend_features_as_spline__n_knots': 4,
 'preprocess__extend_features_as_spline__order': 'C',
 'preprocess__extend_features_as_spline__sparse_output': False,
 'preprocess__scale_to_standard__with_mean': True,
 'preprocess__scale_to_standard__with_std': True,
 'regress__bootstrap': True,
 'regress__ccp_alpha': 0.0,
 'regress__criterion': 'squared_error',
 'regress__max_depth': 8,
 'regress__max_features': 'sqrt',
 'regress__max_leaf_nodes': None,
 'regress__max_samples': None,
 'regress__min_impurity_decrease': 0.0,
 'regress__min_samples_leaf': 1,
 'regress__min_samples_split': 2,
 'regress__min_weight_fraction_leaf': 0.0,
 'regress__monotonic_cst': None,
 'regress__n_estimators': 10,
 'regress__oob_score': False,
 'regress__random_state': None}

Обучение модели:

_ = pipeline.fit(df_orig_features_train, df_target_train.iloc[:, 0])

Оценка качества:

target_test_predicted = pipeline.predict(df_orig_features_test)

Метрики качества (MAPE, а также MSE, MAE):

metrics = score_predictions(df_target_test, target_test_predicted)
metrics
{'mse': 1.5006829920671902,
 'mae': 0.7582020656775502,
 'mape': 0.30794862210624835}
mlflow_log_model(
    pipeline,
    model_params=model_params,
    metrics={k: float(v) for k, v in metrics.items()},
    nested_run_name='Model with engineered features',
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
    #global_comment_file_path=(
    #    model_comment_path
    #    if model_comment_path is not None
    #    else (BASE_PATH / 'research' / model_comment_relpath)
    #),
)
{"model_id":"5821a1adbbe242a882fed4dd765843c8","version_major":2,"version_minor":0}
🏃 View run Model with engineered features at: http://localhost:5000/#/experiments/1/runs/4c056d60749f459196e55711971a0525
🧪 View experiment at: http://localhost:5000/#/experiments/1
🏃 View run Models at: http://localhost:5000/#/experiments/1/runs/4e4a9094cb3c4eed9d4a056a27cadcd9
🧪 View experiment at: http://localhost:5000/#/experiments/1

Модель с дополнительными и отфильтрованными признаками

def build_selected_columns_info_for_mlflow(names=None, indices=None):
    info = {}
    if names is not None:
        info['names'] = names
    if indices is not None:
        info['indices'] = indices
    return info

def build_extra_logs_handler_selected_columns(names=None, indices=None):
    def extra_log(mlf):
        if any((v is not None) for v in (names, indices)):
            info = build_selected_columns_info_for_mlflow(names=names, indices=indices)
            mlf.log_dict(info, 'selected_columns_info.json')
    return extra_log
def build_selected_columns_info_for_mlflow_from_sequential_feature_selector(feature_selector, *, take_names=True, take_indices=True):
    return build_selected_columns_info_for_mlflow(
        names=(feature_selector.k_feature_names_ if take_names else None),
        indices=(tuple(feature_selector.k_feature_idx_) if take_indices else None),
    )

def build_extra_logs_handler_selected_columns_from_sequential_feature_selector(feature_selector):
    def extra_log(mlf):
        info = build_selected_columns_info_for_mlflow_from_sequential_feature_selector(feature_selector)
        mlf.log_dict(info, 'selected_columns_info.json')
    return extra_log
regressor = build_regressor_baseline(random_state=0x8EDD)
regressor
RandomForestRegressor(max_depth=8, max_features='sqrt', n_estimators=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
n_estimators  10
criterion  'squared_error'
max_depth  8
min_samples_split  2
min_samples_leaf  1
min_weight_fraction_leaf  0.0
max_features  'sqrt'
max_leaf_nodes  None
min_impurity_decrease  0.0
bootstrap  True
oob_score  False
n_jobs  None
random_state  None
verbose  0
warm_start  False
ccp_alpha  0.0
max_samples  None
monotonic_cst  None

Выбор признаков среди дополненного набора по минимизации MAPE:

len(df_augd_features_train.columns)
14
FILTERED_FEATURES_NUM = (4, 8)
def build_feature_selector(*, verbose=0):
    return build_sequential_feature_selector(
        regressor, k_features=FILTERED_FEATURES_NUM, forward=True, floating=True, cv=4, scoring='neg_mean_absolute_percentage_error',
        verbose=verbose,
    )
FEATURE_SELECTOR_PARAMS_COMMON_INCLUDE = {
    **{k: True for k in SEQUENTIAL_FEATURE_SELECTOR_PARAMS_COMMON_INCLUDE},
    'estimator': False,
}
FEATURE_SELECTOR_PARAMS_COMMON_EXCLUDE = ()  # TODO: ай-яй-яй
feature_selector = build_feature_selector(verbose=1)
feature_selector
SequentialFeatureSelector(cv=4,
                          estimator=RandomForestRegressor(max_depth=8,
                                                          max_features='sqrt',
                                                          n_estimators=10),
                          floating=True, k_features=(4, 8),
                          scoring='neg_mean_absolute_percentage_error',
                          verbose=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
estimator  RandomForestR...estimators=10)
k_features  (4, ...)
forward  True
floating  True
verbose  1
scoring  'neg_mean_absolute_percentage_error'
cv  4
n_jobs  1
pre_dispatch  '2*n_jobs'
clone_estimator  True
fixed_features  None
feature_groups  None
RandomForestRegressor(max_depth=8, max_features='sqrt', n_estimators=10)
Parameters
n_estimators  10
criterion  'squared_error'
max_depth  8
min_samples_split  2
min_samples_leaf  1
min_weight_fraction_leaf  0.0
max_features  'sqrt'
max_leaf_nodes  None
min_impurity_decrease  0.0
bootstrap  True
oob_score  False
n_jobs  None
random_state  None
verbose  0
warm_start  False
ccp_alpha  0.0
max_samples  None
monotonic_cst  None
_ = feature_selector.fit(df_augd_features_train, df_target_train.iloc[:, 0])
[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:    0.6s finished
Features: 1/8[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    0.5s finished
Features: 2/8[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s finished
Features: 3/8[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished
Features: 4/8[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s finished
Features: 5/8[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
Features: 6/8[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s finished
Features: 5/8[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
Features: 6/8[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.2s finished
Features: 7/8[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.2s finished
Features: 7/8[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.2s finished
Features: 8/8

Выбранные признаки (имена и индексы):

build_selected_columns_info_for_mlflow_from_sequential_feature_selector(feature_selector)
{'names': ('extend_features_as_polynomial__selling_price',
  'extend_features_as_polynomial__selling_price^2',
  'extend_features_as_spline__age_sp_1',
  'extend_features_as_spline__age_sp_2',
  'scale_to_standard__age'),
 'indices': (0, 2, 6, 7, 10)}

MAPE в зависимости от количества выбранных признаков (указан регион выбора, ограниченный FILTERED_FEATURES_NUM):

fig, ax = plot_sequential_feature_selection(feature_selector, kind='std_dev')
ax.grid(True)
if isinstance(FILTERED_FEATURES_NUM, Sequence):
    _ = ax.axvspan(min(FILTERED_FEATURES_NUM), max(FILTERED_FEATURES_NUM), color=matplotlib.colormaps.get_cmap('tab10')(6), alpha=0.15)
# хотелось бы поставить верхнюю границу `len(df_augd_features_train.columns)`, но SequentialFeatureSelector до неё не досчитывает-то
_ = ax.set_xlim((1, (max(FILTERED_FEATURES_NUM) if isinstance(FILTERED_FEATURES_NUM, Sequence) else FILTERED_FEATURES_NUM)))
_ = ax.set_ylim((None, 0.))

Составной пайплайн:

pipeline = sklearn.pipeline.Pipeline([
    ('preprocess', build_preprocess_augmenting_transformer()),
    ('select_features', feature_selector),
    ('regress', regressor),
])
pipeline
Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('extend_features_as_polynomial',
                                                  Pipeline(steps=[('extend_features',
                                                                   PolynomialFeatures(include_bias=False)),
                                                                  ('scale_to_standard',
                                                                   StandardScaler())]),
                                                  ('selling_price',
                                                   'driven_kms')),
                                                 ('extend_features_as_spline',
                                                  SplineTransformer(include_bias=False,
                                                                    knots='quantile',
                                                                    n_knots=4),
                                                  ('age',)),
                                                 ('s...
                                                  ('fuel_type', 'selling_type',
                                                   'transmission'))])),
                ('select_features',
                 SequentialFeatureSelector(cv=4,
                                           estimator=RandomForestRegressor(max_depth=8,
                                                                           max_features='sqrt',
                                                                           n_estimators=10),
                                           floating=True, k_features=(4, 8),
                                           scoring='neg_mean_absolute_percentage_error',
                                           verbose=1)),
                ('regress',
                 RandomForestRegressor(max_depth=8, max_features='sqrt',
                                       n_estimators=10))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
steps  [('preprocess', ...), ('select_features', ...), ...]
transform_input  None
memory  None
verbose  False
Parameters
transformers  [('extend_features_as_polynomial', ...), ('extend_features_as_spline', ...), ...]
remainder  'drop'
sparse_threshold  0.3
n_jobs  None
transformer_weights  None
verbose  False
verbose_feature_names_out  True
force_int_remainder_cols  'deprecated'
('selling_price', 'driven_kms')
Parameters
degree  2
interaction_only  False
include_bias  False
order  'C'
Parameters
copy  True
with_mean  True
with_std  True
('age',)
Parameters
n_knots  4
degree  3
knots  'quantile'
extrapolation  'constant'
include_bias  False
order  'C'
sparse_output  False
('age',)
Parameters
copy  True
with_mean  True
with_std  True
('fuel_type', 'selling_type', 'transmission')
Parameters
categories  'auto'
target_type  'continuous'
smooth  'auto'
cv  5
shuffle  True
random_state  11990
Parameters
estimator  RandomForestR...estimators=10)
k_features  (4, ...)
forward  True
floating  True
verbose  1
scoring  'neg_mean_absolute_percentage_error'
cv  4
n_jobs  1
pre_dispatch  '2*n_jobs'
clone_estimator  True
fixed_features  None
feature_groups  None
RandomForestRegressor(max_depth=8, max_features='sqrt', n_estimators=10)
Parameters
n_estimators  10
criterion  'squared_error'
max_depth  8
min_samples_split  2
min_samples_leaf  1
min_weight_fraction_leaf  0.0
max_features  'sqrt'
max_leaf_nodes  None
min_impurity_decrease  0.0
bootstrap  True
oob_score  False
n_jobs  None
random_state  None
verbose  0
warm_start  False
ccp_alpha  0.0
max_samples  None
monotonic_cst  None
Parameters
n_estimators  10
criterion  'squared_error'
max_depth  8
min_samples_split  2
min_samples_leaf  1
min_weight_fraction_leaf  0.0
max_features  'sqrt'
max_leaf_nodes  None
min_impurity_decrease  0.0
bootstrap  True
oob_score  False
n_jobs  None
random_state  None
verbose  0
warm_start  False
ccp_alpha  0.0
max_samples  None
monotonic_cst  None
model_params = filter_params(
    pipeline.get_params(),
    include={
        'preprocess': (False, PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_INCLUDE.copy()),
        'select_features': (False, FEATURE_SELECTOR_PARAMS_COMMON_INCLUDE.copy()),
        'regress': (False, True),
    },
    exclude={
        'preprocess': PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_EXCLUDE.copy(),
        'select_features': FEATURE_SELECTOR_PARAMS_COMMON_EXCLUDE,
        'regress': RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE,
    },
)
model_params
{'preprocess__remainder': 'drop',
 'preprocess__sparse_threshold': 0.3,
 'preprocess__transformer_weights': None,
 'preprocess__extend_features_as_spline': SplineTransformer(include_bias=False, knots='quantile', n_knots=4),
 'preprocess__extend_features_as_polynomial__extend_features': PolynomialFeatures(include_bias=False),
 'preprocess__extend_features_as_polynomial__extend_features__degree': 2,
 'preprocess__extend_features_as_polynomial__extend_features__include_bias': False,
 'preprocess__extend_features_as_polynomial__extend_features__interaction_only': False,
 'preprocess__extend_features_as_polynomial__extend_features__order': 'C',
 'preprocess__extend_features_as_polynomial__scale_to_standard__with_mean': True,
 'preprocess__extend_features_as_polynomial__scale_to_standard__with_std': True,
 'preprocess__extend_features_as_spline__degree': 3,
 'preprocess__extend_features_as_spline__extrapolation': 'constant',
 'preprocess__extend_features_as_spline__include_bias': False,
 'preprocess__extend_features_as_spline__knots': 'quantile',
 'preprocess__extend_features_as_spline__n_knots': 4,
 'preprocess__extend_features_as_spline__order': 'C',
 'preprocess__extend_features_as_spline__sparse_output': False,
 'preprocess__scale_to_standard__with_mean': True,
 'preprocess__scale_to_standard__with_std': True,
 'select_features__cv': 4,
 'select_features__feature_groups': None,
 'select_features__fixed_features': None,
 'select_features__floating': True,
 'select_features__forward': True,
 'select_features__k_features': (4, 8),
 'select_features__scoring': 'neg_mean_absolute_percentage_error',
 'regress__bootstrap': True,
 'regress__ccp_alpha': 0.0,
 'regress__criterion': 'squared_error',
 'regress__max_depth': 8,
 'regress__max_features': 'sqrt',
 'regress__max_leaf_nodes': None,
 'regress__max_samples': None,
 'regress__min_impurity_decrease': 0.0,
 'regress__min_samples_leaf': 1,
 'regress__min_samples_split': 2,
 'regress__min_weight_fraction_leaf': 0.0,
 'regress__monotonic_cst': None,
 'regress__n_estimators': 10,
 'regress__oob_score': False,
 'regress__random_state': None}

Обучение модели:

# XXX: SequentialFeatureSelector обучается опять!?
_ = pipeline.fit(df_orig_features_train, df_target_train.iloc[:, 0])
[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:    0.5s finished
Features: 1/8[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    0.5s finished
Features: 2/8[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s finished
Features: 3/8[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished
Features: 4/8[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s finished
Features: 5/8[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
Features: 6/8[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.2s finished
Features: 7/8[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.2s finished
Features: 8/8

Оценка качества:

target_test_predicted = pipeline.predict(df_orig_features_test)

Метрики качества (MAPE, а также MSE, MAE):

metrics = score_predictions(df_target_test, target_test_predicted)
metrics
{'mse': 1.0194872911964548,
 'mae': 0.6263087407494466,
 'mape': 0.20033337884798225}
mlflow_log_model(
    pipeline,
    model_params=model_params,
    metrics={k: float(v) for k, v in metrics.items()},
    nested_run_name='Model with filtered engineered features',
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
    #global_comment_file_path=(
    #    model_comment_path
    #    if model_comment_path is not None
    #    else (BASE_PATH / 'research' / model_comment_relpath)
    #),
    extra_logs_handler=(build_extra_logs_handler_selected_columns_from_sequential_feature_selector(pipeline.named_steps['select_features']),),
)
{"model_id":"15d75fa1d12046c8b197bf0ac21439b9","version_major":2,"version_minor":0}
🏃 View run Model with filtered engineered features at: http://localhost:5000/#/experiments/1/runs/2236e7acb9df4f689ca0b660e216560d
🧪 View experiment at: http://localhost:5000/#/experiments/1
🏃 View run Models at: http://localhost:5000/#/experiments/1/runs/4e4a9094cb3c4eed9d4a056a27cadcd9
🧪 View experiment at: http://localhost:5000/#/experiments/1

Автоматический подбор гиперпараметров модели

Составной пайплайн:

def build_pipeline(regressor_n_estimators, regressor_max_depth=None, regressor_max_features='sqrt'):
    return sklearn.pipeline.Pipeline([
        ('preprocess', build_preprocess_augmenting_transformer()), 
        ('select_features', build_feature_selector()),
        ('regress', build_regressor(regressor_n_estimators, max_depth=regressor_max_depth, max_features=regressor_max_features)),
    ])

Целевая функция для оптимизатора гиперпараметров (подбирает параметры RandomForestRegressor: n_estimators, max_depth, max_features):

def regressor_hyperparams_objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 1, 256, log=True)
    max_depth = trial.suggest_int('max_depth', 1, 16, log=True)
    max_features = trial.suggest_float('max_features', 0.1, 1.)
    # составной пайплайн:
    pipeline = build_pipeline(n_estimators, regressor_max_depth=max_depth, regressor_max_features=max_features)
    # обучение модели:
    _ = pipeline.fit(df_orig_features_train, df_target_train.iloc[:, 0])
    # оценка качества:
    target_test_predicted = pipeline.predict(df_orig_features_test)
    # метрика качества (MAPE):
    mape = sklearn.metrics.mean_absolute_percentage_error(df_target_test, target_test_predicted)
    return mape

optuna study:

optuna_sampler = optuna.samplers.TPESampler(seed=0x0A1C)
optuna_study = optuna.create_study(sampler=optuna_sampler, direction='minimize')
optuna_study.optimize(regressor_hyperparams_objective, n_trials=24)
[I 2025-11-02 01:54:34,763] A new study created in memory with name: no-name-a51c5e47-d34c-41d9-a12c-73c911dfc2c9
[I 2025-11-02 01:54:45,860] Trial 0 finished with value: 0.31673042260874146 and parameters: {'n_estimators': 1, 'max_depth': 5, 'max_features': 0.7538601592025193}. Best is trial 0 with value: 0.31673042260874146.
[I 2025-11-02 01:54:55,277] Trial 1 finished with value: 0.2221917240762483 and parameters: {'n_estimators': 243, 'max_depth': 6, 'max_features': 0.8990011000072798}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:55:07,160] Trial 2 finished with value: 0.9128522772564759 and parameters: {'n_estimators': 1, 'max_depth': 3, 'max_features': 0.3925657054705518}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:55:15,650] Trial 3 finished with value: 0.36954875874544413 and parameters: {'n_estimators': 3, 'max_depth': 6, 'max_features': 0.15481298252760906}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:55:22,138] Trial 4 finished with value: 0.44425704345344336 and parameters: {'n_estimators': 6, 'max_depth': 5, 'max_features': 0.1048988611194081}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:55:29,471] Trial 5 finished with value: 0.23556663756910004 and parameters: {'n_estimators': 10, 'max_depth': 9, 'max_features': 0.19332272517658144}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:55:36,076] Trial 6 finished with value: 1.8160905200927615 and parameters: {'n_estimators': 100, 'max_depth': 2, 'max_features': 0.302198580450086}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:55:43,776] Trial 7 finished with value: 0.3732943995516396 and parameters: {'n_estimators': 1, 'max_depth': 4, 'max_features': 0.7063695125561774}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:55:52,289] Trial 8 finished with value: 0.2593980092715887 and parameters: {'n_estimators': 1, 'max_depth': 6, 'max_features': 0.3967800872812408}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:55:58,016] Trial 9 finished with value: 0.7822104999528057 and parameters: {'n_estimators': 12, 'max_depth': 2, 'max_features': 0.9922723597006147}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:56:05,063] Trial 10 finished with value: 2.6164201373576716 and parameters: {'n_estimators': 242, 'max_depth': 1, 'max_features': 0.9886866627656377}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:56:13,968] Trial 11 finished with value: 0.20656714196125883 and parameters: {'n_estimators': 27, 'max_depth': 16, 'max_features': 0.6239771558659984}. Best is trial 11 with value: 0.20656714196125883.
[I 2025-11-02 01:56:20,623] Trial 12 finished with value: 0.20542336297369948 and parameters: {'n_estimators': 49, 'max_depth': 16, 'max_features': 0.6557746668259969}. Best is trial 12 with value: 0.20542336297369948.
[I 2025-11-02 01:56:26,355] Trial 13 finished with value: 0.2000663860477421 and parameters: {'n_estimators': 38, 'max_depth': 16, 'max_features': 0.6301192326123578}. Best is trial 13 with value: 0.2000663860477421.
[I 2025-11-02 01:56:35,951] Trial 14 finished with value: 0.19564416684050806 and parameters: {'n_estimators': 42, 'max_depth': 10, 'max_features': 0.5116506936294666}. Best is trial 14 with value: 0.19564416684050806.
[I 2025-11-02 01:56:43,803] Trial 15 finished with value: 0.2002547156962009 and parameters: {'n_estimators': 39, 'max_depth': 10, 'max_features': 0.5149110154288706}. Best is trial 14 with value: 0.19564416684050806.
[I 2025-11-02 01:56:51,726] Trial 16 finished with value: 0.19801054591841608 and parameters: {'n_estimators': 86, 'max_depth': 14, 'max_features': 0.5190201664970586}. Best is trial 14 with value: 0.19564416684050806.
[I 2025-11-02 01:57:00,437] Trial 17 finished with value: 0.19032917786557116 and parameters: {'n_estimators': 118, 'max_depth': 9, 'max_features': 0.5081814551833768}. Best is trial 17 with value: 0.19032917786557116.
[I 2025-11-02 01:57:10,113] Trial 18 finished with value: 0.1813407546593623 and parameters: {'n_estimators': 103, 'max_depth': 8, 'max_features': 0.4209660896030313}. Best is trial 18 with value: 0.1813407546593623.
[I 2025-11-02 01:57:17,883] Trial 19 finished with value: 3.000176786841034 and parameters: {'n_estimators': 126, 'max_depth': 1, 'max_features': 0.38890086577637156}. Best is trial 18 with value: 0.1813407546593623.
[I 2025-11-02 01:57:25,435] Trial 20 finished with value: 0.22680617405659242 and parameters: {'n_estimators': 22, 'max_depth': 8, 'max_features': 0.2882817069063991}. Best is trial 18 with value: 0.1813407546593623.
[I 2025-11-02 01:57:32,849] Trial 21 finished with value: 0.1752331836664575 and parameters: {'n_estimators': 78, 'max_depth': 10, 'max_features': 0.4752873867901817}. Best is trial 21 with value: 0.1752331836664575.
[I 2025-11-02 01:57:39,316] Trial 22 finished with value: 0.18475613206691036 and parameters: {'n_estimators': 129, 'max_depth': 11, 'max_features': 0.4390718757757792}. Best is trial 21 with value: 0.1752331836664575.
[I 2025-11-02 01:57:46,769] Trial 23 finished with value: 0.18901586701156378 and parameters: {'n_estimators': 67, 'max_depth': 11, 'max_features': 0.4285121891249491}. Best is trial 21 with value: 0.1752331836664575.

Количество выполненных trials:

len(optuna_study.trials)
24

Лучшие найдённые гиперпараметры:

repr(optuna_study.best_params)
"{'n_estimators': 78, 'max_depth': 10, 'max_features': 0.4752873867901817}"
regressor_best_params = dict(optuna_study.best_params.items())

Составной пайплайн:

def build_pipeline_optimized_best():
    return build_pipeline(
        regressor_best_params['n_estimators'],
        regressor_max_depth=regressor_best_params['max_depth'],
        regressor_max_features=regressor_best_params['max_features'],
    )
pipeline = build_pipeline_optimized_best()
pipeline
Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('extend_features_as_polynomial',
                                                  Pipeline(steps=[('extend_features',
                                                                   PolynomialFeatures(include_bias=False)),
                                                                  ('scale_to_standard',
                                                                   StandardScaler())]),
                                                  ('selling_price',
                                                   'driven_kms')),
                                                 ('extend_features_as_spline',
                                                  SplineTransformer(include_bias=False,
                                                                    knots='quantile',
                                                                    n_knots=4),
                                                  ('age',)),
                                                 ('s...
                                                  ('fuel_type', 'selling_type',
                                                   'transmission'))])),
                ('select_features',
                 SequentialFeatureSelector(cv=4,
                                           estimator=RandomForestRegressor(max_depth=8,
                                                                           max_features='sqrt',
                                                                           n_estimators=10),
                                           floating=True, k_features=(4, 8),
                                           scoring='neg_mean_absolute_percentage_error')),
                ('regress',
                 RandomForestRegressor(max_depth=10,
                                       max_features=0.4752873867901817,
                                       n_estimators=78))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
steps  [('preprocess', ...), ('select_features', ...), ...]
transform_input  None
memory  None
verbose  False
Parameters
transformers  [('extend_features_as_polynomial', ...), ('extend_features_as_spline', ...), ...]
remainder  'drop'
sparse_threshold  0.3
n_jobs  None
transformer_weights  None
verbose  False
verbose_feature_names_out  True
force_int_remainder_cols  'deprecated'
('selling_price', 'driven_kms')
Parameters
degree  2
interaction_only  False
include_bias  False
order  'C'
Parameters
copy  True
with_mean  True
with_std  True
('age',)
Parameters
n_knots  4
degree  3
knots  'quantile'
extrapolation  'constant'
include_bias  False
order  'C'
sparse_output  False
('age',)
Parameters
copy  True
with_mean  True
with_std  True
('fuel_type', 'selling_type', 'transmission')
Parameters
categories  'auto'
target_type  'continuous'
smooth  'auto'
cv  5
shuffle  True
random_state  11990
Parameters
estimator  RandomForestR...estimators=10)
k_features  (4, ...)
forward  True
floating  True
verbose  0
scoring  'neg_mean_absolute_percentage_error'
cv  4
n_jobs  1
pre_dispatch  '2*n_jobs'
clone_estimator  True
fixed_features  None
feature_groups  None
RandomForestRegressor(max_depth=8, max_features='sqrt', n_estimators=10)
Parameters
n_estimators  10
criterion  'squared_error'
max_depth  8
min_samples_split  2
min_samples_leaf  1
min_weight_fraction_leaf  0.0
max_features  'sqrt'
max_leaf_nodes  None
min_impurity_decrease  0.0
bootstrap  True
oob_score  False
n_jobs  None
random_state  None
verbose  0
warm_start  False
ccp_alpha  0.0
max_samples  None
monotonic_cst  None
Parameters
n_estimators  78
criterion  'squared_error'
max_depth  10
min_samples_split  2
min_samples_leaf  1
min_weight_fraction_leaf  0.0
max_features  0.4752873867901817
max_leaf_nodes  None
min_impurity_decrease  0.0
bootstrap  True
oob_score  False
n_jobs  None
random_state  None
verbose  0
warm_start  False
ccp_alpha  0.0
max_samples  None
monotonic_cst  None
model_params = filter_params(
    pipeline.get_params(),
    include={
        'preprocess': (False, PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_INCLUDE.copy()),
        'select_features': (False, FEATURE_SELECTOR_PARAMS_COMMON_INCLUDE.copy()),
        'regress': (False, True),
    },
    exclude={
        'preprocess': PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_EXCLUDE.copy(),
        'select_features': FEATURE_SELECTOR_PARAMS_COMMON_EXCLUDE,
        'regress': RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE,
    },
)
model_params
{'preprocess__remainder': 'drop',
 'preprocess__sparse_threshold': 0.3,
 'preprocess__transformer_weights': None,
 'preprocess__extend_features_as_spline': SplineTransformer(include_bias=False, knots='quantile', n_knots=4),
 'preprocess__extend_features_as_polynomial__extend_features': PolynomialFeatures(include_bias=False),
 'preprocess__extend_features_as_polynomial__extend_features__degree': 2,
 'preprocess__extend_features_as_polynomial__extend_features__include_bias': False,
 'preprocess__extend_features_as_polynomial__extend_features__interaction_only': False,
 'preprocess__extend_features_as_polynomial__extend_features__order': 'C',
 'preprocess__extend_features_as_polynomial__scale_to_standard__with_mean': True,
 'preprocess__extend_features_as_polynomial__scale_to_standard__with_std': True,
 'preprocess__extend_features_as_spline__degree': 3,
 'preprocess__extend_features_as_spline__extrapolation': 'constant',
 'preprocess__extend_features_as_spline__include_bias': False,
 'preprocess__extend_features_as_spline__knots': 'quantile',
 'preprocess__extend_features_as_spline__n_knots': 4,
 'preprocess__extend_features_as_spline__order': 'C',
 'preprocess__extend_features_as_spline__sparse_output': False,
 'preprocess__scale_to_standard__with_mean': True,
 'preprocess__scale_to_standard__with_std': True,
 'select_features__cv': 4,
 'select_features__feature_groups': None,
 'select_features__fixed_features': None,
 'select_features__floating': True,
 'select_features__forward': True,
 'select_features__k_features': (4, 8),
 'select_features__scoring': 'neg_mean_absolute_percentage_error',
 'regress__bootstrap': True,
 'regress__ccp_alpha': 0.0,
 'regress__criterion': 'squared_error',
 'regress__max_depth': 10,
 'regress__max_features': 0.4752873867901817,
 'regress__max_leaf_nodes': None,
 'regress__max_samples': None,
 'regress__min_impurity_decrease': 0.0,
 'regress__min_samples_leaf': 1,
 'regress__min_samples_split': 2,
 'regress__min_weight_fraction_leaf': 0.0,
 'regress__monotonic_cst': None,
 'regress__n_estimators': 78,
 'regress__oob_score': False,
 'regress__random_state': None}

Обучение модели:

_ = pipeline.fit(df_orig_features_train, df_target_train.iloc[:, 0])

Оценка качества:

target_test_predicted = pipeline.predict(df_orig_features_test)

Метрики качества (MAPE, а также MSE, MAE):

metrics = score_predictions(df_target_test, target_test_predicted)
metrics
{'mse': 0.9370236080018509,
 'mae': 0.6048078379366015,
 'mape': 0.19721535277529492}
mlflow_log_model(
    pipeline,
    model_params=model_params,
    metrics={k: float(v) for k, v in metrics.items()},
    nested_run_name='Optimized model with filtered engineered features',
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
    #global_comment_file_path=(
    #    model_comment_path
    #    if model_comment_path is not None
    #    else (BASE_PATH / 'research' / model_comment_relpath)
    #),
    extra_logs_handler=(build_extra_logs_handler_selected_columns_from_sequential_feature_selector(pipeline.named_steps['select_features']),),
)
{"model_id":"6f4a84b68c834b93bc62c1982114ddea","version_major":2,"version_minor":0}
🏃 View run Optimized model with filtered engineered features at: http://localhost:5000/#/experiments/1/runs/c8af91a577d24b74adba3348a90b5e69
🧪 View experiment at: http://localhost:5000/#/experiments/1
🏃 View run Models at: http://localhost:5000/#/experiments/1/runs/4e4a9094cb3c4eed9d4a056a27cadcd9
🧪 View experiment at: http://localhost:5000/#/experiments/1

И в продакшн

Лучшая выбранная модель — с автоматически подобранными гиперпараметрами.

pipeline = build_pipeline_optimized_best()
pipeline
Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('extend_features_as_polynomial',
                                                  Pipeline(steps=[('extend_features',
                                                                   PolynomialFeatures(include_bias=False)),
                                                                  ('scale_to_standard',
                                                                   StandardScaler())]),
                                                  ('selling_price',
                                                   'driven_kms')),
                                                 ('extend_features_as_spline',
                                                  SplineTransformer(include_bias=False,
                                                                    knots='quantile',
                                                                    n_knots=4),
                                                  ('age',)),
                                                 ('s...
                                                  ('fuel_type', 'selling_type',
                                                   'transmission'))])),
                ('select_features',
                 SequentialFeatureSelector(cv=4,
                                           estimator=RandomForestRegressor(max_depth=8,
                                                                           max_features='sqrt',
                                                                           n_estimators=10),
                                           floating=True, k_features=(4, 8),
                                           scoring='neg_mean_absolute_percentage_error')),
                ('regress',
                 RandomForestRegressor(max_depth=10,
                                       max_features=0.4752873867901817,
                                       n_estimators=78))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Parameters
steps  [('preprocess', ...), ('select_features', ...), ...]
transform_input  None
memory  None
verbose  False
Parameters
transformers  [('extend_features_as_polynomial', ...), ('extend_features_as_spline', ...), ...]
remainder  'drop'
sparse_threshold  0.3
n_jobs  None
transformer_weights  None
verbose  False
verbose_feature_names_out  True
force_int_remainder_cols  'deprecated'
('selling_price', 'driven_kms')
Parameters
degree  2
interaction_only  False
include_bias  False
order  'C'
Parameters
copy  True
with_mean  True
with_std  True
('age',)
Parameters
n_knots  4
degree  3
knots  'quantile'
extrapolation  'constant'
include_bias  False
order  'C'
sparse_output  False
('age',)
Parameters
copy  True
with_mean  True
with_std  True
('fuel_type', 'selling_type', 'transmission')
Parameters
categories  'auto'
target_type  'continuous'
smooth  'auto'
cv  5
shuffle  True
random_state  11990
Parameters
estimator  RandomForestR...estimators=10)
k_features  (4, ...)
forward  True
floating  True
verbose  0
scoring  'neg_mean_absolute_percentage_error'
cv  4
n_jobs  1
pre_dispatch  '2*n_jobs'
clone_estimator  True
fixed_features  None
feature_groups  None
RandomForestRegressor(max_depth=8, max_features='sqrt', n_estimators=10)
Parameters
n_estimators  10
criterion  'squared_error'
max_depth  8
min_samples_split  2
min_samples_leaf  1
min_weight_fraction_leaf  0.0
max_features  'sqrt'
max_leaf_nodes  None
min_impurity_decrease  0.0
bootstrap  True
oob_score  False
n_jobs  None
random_state  None
verbose  0
warm_start  False
ccp_alpha  0.0
max_samples  None
monotonic_cst  None
Parameters
n_estimators  78
criterion  'squared_error'
max_depth  10
min_samples_split  2
min_samples_leaf  1
min_weight_fraction_leaf  0.0
max_features  0.4752873867901817
max_leaf_nodes  None
min_impurity_decrease  0.0
bootstrap  True
oob_score  False
n_jobs  None
random_state  None
verbose  0
warm_start  False
ccp_alpha  0.0
max_samples  None
monotonic_cst  None
model_params = filter_params(
    pipeline.get_params(),
    include={
        'preprocess': (False, PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_INCLUDE.copy()),
        'select_features': (False, FEATURE_SELECTOR_PARAMS_COMMON_INCLUDE.copy()),
        'regress': (False, True),
    },
    exclude={
        'preprocess': PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_EXCLUDE.copy(),
        'select_features': FEATURE_SELECTOR_PARAMS_COMMON_EXCLUDE,
        'regress': RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE,
    },
)
model_params
{'preprocess__remainder': 'drop',
 'preprocess__sparse_threshold': 0.3,
 'preprocess__transformer_weights': None,
 'preprocess__extend_features_as_spline': SplineTransformer(include_bias=False, knots='quantile', n_knots=4),
 'preprocess__extend_features_as_polynomial__extend_features': PolynomialFeatures(include_bias=False),
 'preprocess__extend_features_as_polynomial__extend_features__degree': 2,
 'preprocess__extend_features_as_polynomial__extend_features__include_bias': False,
 'preprocess__extend_features_as_polynomial__extend_features__interaction_only': False,
 'preprocess__extend_features_as_polynomial__extend_features__order': 'C',
 'preprocess__extend_features_as_polynomial__scale_to_standard__with_mean': True,
 'preprocess__extend_features_as_polynomial__scale_to_standard__with_std': True,
 'preprocess__extend_features_as_spline__degree': 3,
 'preprocess__extend_features_as_spline__extrapolation': 'constant',
 'preprocess__extend_features_as_spline__include_bias': False,
 'preprocess__extend_features_as_spline__knots': 'quantile',
 'preprocess__extend_features_as_spline__n_knots': 4,
 'preprocess__extend_features_as_spline__order': 'C',
 'preprocess__extend_features_as_spline__sparse_output': False,
 'preprocess__scale_to_standard__with_mean': True,
 'preprocess__scale_to_standard__with_std': True,
 'select_features__cv': 4,
 'select_features__feature_groups': None,
 'select_features__fixed_features': None,
 'select_features__floating': True,
 'select_features__forward': True,
 'select_features__k_features': (4, 8),
 'select_features__scoring': 'neg_mean_absolute_percentage_error',
 'regress__bootstrap': True,
 'regress__ccp_alpha': 0.0,
 'regress__criterion': 'squared_error',
 'regress__max_depth': 10,
 'regress__max_features': 0.4752873867901817,
 'regress__max_leaf_nodes': None,
 'regress__max_samples': None,
 'regress__min_impurity_decrease': 0.0,
 'regress__min_samples_leaf': 1,
 'regress__min_samples_split': 2,
 'regress__min_weight_fraction_leaf': 0.0,
 'regress__monotonic_cst': None,
 'regress__n_estimators': 78,
 'regress__oob_score': False,
 'regress__random_state': None}
_ = pipeline.fit(df_orig_features, df_target.iloc[:, 0])
mlflow_log_model(
    pipeline,
    model_params=model_params,
    metrics=None,
    nested_run_name='Final model',
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
    #global_comment_file_path=(
    #    model_comment_path
    #    if model_comment_path is not None
    #    else (BASE_PATH / 'research' / model_comment_relpath)
    #),
    extra_logs_handler=(build_extra_logs_handler_selected_columns_from_sequential_feature_selector(pipeline.named_steps['select_features']),),
)
{"model_id":"877854c58cbf4e3c959298d0959eea39","version_major":2,"version_minor":0}
🏃 View run Final model at: http://localhost:5000/#/experiments/1/runs/4c7f04ad9ee94237b44f60b6eb14b41e
🧪 View experiment at: http://localhost:5000/#/experiments/1
🏃 View run Models at: http://localhost:5000/#/experiments/1/runs/4e4a9094cb3c4eed9d4a056a27cadcd9
🧪 View experiment at: http://localhost:5000/#/experiments/1