Исследование и настройка предсказательной модели для цен на подержанные автомобили

Parameters

	steps	[('preprocess', ...), ('regress', ...)]
	transform_input	None
	memory	None
	verbose	False

preprocess: ColumnTransformer

Parameters

	transformers	[('scale_to_standard', ...), ('encode_categoricals_wrt_target', ...)]
	remainder	'drop'
	sparse_threshold	0.3
	n_jobs	None
	transformer_weights	None
	verbose	False
	verbose_feature_names_out	True
	force_int_remainder_cols	'deprecated'

scale_to_standard

('selling_price', 'driven_kms', 'age')

StandardScaler

Parameters

	copy	True
	with_mean	True
	with_std	True

encode_categoricals_wrt_target

('fuel_type', 'selling_type', 'transmission')

TargetEncoder

Parameters

	categories	'auto'
	target_type	'continuous'
	smooth	'auto'
	cv	5
	shuffle	True
	random_state	11990

RandomForestRegressor

Parameters

	n_estimators	10
	criterion	'squared_error'
	max_depth	8
	min_samples_split	2
	min_samples_leaf	1
	min_weight_fraction_leaf	0.0
	max_features	'sqrt'
	max_leaf_nodes	None
	min_impurity_decrease	0.0
	bootstrap	True
	oob_score	False
	n_jobs	None
	random_state	None
	verbose	0
	warm_start	False
	ccp_alpha	0.0
	max_samples	None
	monotonic_cst	None

# Select the pipeline hyper-parameters that are logged together with the baseline model.
# `include` / `exclude` are keyed by pipeline step name and then by nested transformer
# name, so every key has to match a name used when the pipeline was assembled.
model_params = filter_params(
    pipeline.get_params(),
    include={
        'preprocess': (
            False,
            {
                **{k: True for k in COLUMN_TRANSFORMER_PARAMS_COMMON_INCLUDE},
                'scale_to_standard': True,
                # The transformer is registered as 'encode_categoricals_wrt_target'
                # (plural); the previous singular spelling matched nothing, so the
                # TargetEncoder parameters were silently missing from the logged set.
                'encode_categoricals_wrt_target': True,
            },
        ),
        'regress': (False, True),
    },
    exclude={
        'preprocess': {'scale_to_standard': STANDARD_SCALER_PARAMS_COMMON_EXCLUDE},
        'regress': RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE,
    },
)
model_params

{'preprocess__remainder': 'drop',
 'preprocess__sparse_threshold': 0.3,
 'preprocess__transformer_weights': None,
 'preprocess__scale_to_standard__with_mean': True,
 'preprocess__scale_to_standard__with_std': True,
 'regress__bootstrap': True,
 'regress__ccp_alpha': 0.0,
 'regress__criterion': 'squared_error',
 'regress__max_depth': 8,
 'regress__max_features': 'sqrt',
 'regress__max_leaf_nodes': None,
 'regress__max_samples': None,
 'regress__min_impurity_decrease': 0.0,
 'regress__min_samples_leaf': 1,
 'regress__min_samples_split': 2,
 'regress__min_weight_fraction_leaf': 0.0,
 'regress__monotonic_cst': None,
 'regress__n_estimators': 10,
 'regress__oob_score': False,
 'regress__random_state': None}

Обучение модели:

# Fit the whole pipeline on the training split; the target is passed as the first
# column of the target frame. Binding the result to `_` presumably keeps the
# fitted-estimator repr out of the cell output.
_ = pipeline.fit(df_orig_features_train, df_target_train.iloc[:, 0])

Оценка качества:

# Predict the target for the held-out test features with the fitted pipeline.
target_test_predicted = pipeline.predict(df_orig_features_test)

Метрики качества (MAPE, а также MSE, MAE):

# Compare the test targets with the predictions; the result shown below is a dict
# with 'mse', 'mae' and 'mape' entries.
metrics = score_predictions(df_target_test, target_test_predicted)
metrics

{'mse': 1.1769122812432413,
 'mae': 0.7433282022345273,
 'mape': 0.3469466962984192}

# Log the fitted baseline pipeline, its filtered parameters and its metrics to MLflow
# under the nested run name 'Baseline model'. Metric values are cast to plain floats
# before logging.
mlflow_log_model(
    pipeline,
    model_params=model_params,
    metrics={k: float(v) for k, v in metrics.items()},
    nested_run_name='Baseline model',
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
    # NOTE: attaching a global comment file is currently disabled.
    #global_comment_file_path=(
    #    model_comment_path
    #    if model_comment_path is not None
    #     else (BASE_PATH / 'research' / model_comment_relpath)
    #),
)

{"model_id":"9ebfedda037646158f6e4acd2cbab0e5","version_major":2,"version_minor":0}

🏃 View run Baseline model at: http://localhost:5000/#/experiments/1/runs/76affaba12a24ee68eb979ae373eb70a
🧪 View experiment at: http://localhost:5000/#/experiments/1
🏃 View run Models at: http://localhost:5000/#/experiments/1/runs/4e4a9094cb3c4eed9d4a056a27cadcd9
🧪 View experiment at: http://localhost:5000/#/experiments/1

Модель с дополнительными признаками

Пайплайн предобработки признаков:

# Numeric columns expanded with degree-2 polynomial terms.
features_to_extend_as_polynomial = ('selling_price', 'driven_kms')
# Numeric columns expanded with a spline basis.
features_to_extend_as_spline = ('age',)

def build_preprocess_augmenting_transformer():
    """Build the unfitted ColumnTransformer that augments and preprocesses features.

    Branches, in output order:
      * 'extend_features_as_polynomial': degree-2 polynomial expansion (no bias
        column) of the polynomial columns, followed by standard scaling;
      * 'extend_features_as_spline': spline basis with 4 quantile-placed knots
        (no bias column, constant extrapolation) of the spline columns;
      * 'scale_to_standard': standard scaling of the remaining numeric columns;
      * 'encode_categoricals_wrt_target': target encoding of the categorical columns.

    Columns not claimed by any branch are dropped. Both groups of extended columns
    must be among the columns normally scaled to standard.
    """
    assert set(features_to_extend_as_polynomial).issubset(features_to_scale_to_standard_columns)
    assert set(features_to_extend_as_spline).issubset(features_to_scale_to_standard_columns)

    polynomial_branch = sklearn.pipeline.Pipeline([
        ('extend_features', sklearn.preprocessing.PolynomialFeatures(2, include_bias=False)),
        ('scale_to_standard', build_features_scaler_standard()),
    ])
    spline_branch = sklearn.preprocessing.SplineTransformer(
        4, knots='quantile', extrapolation='constant', include_bias=False,
    )
    # Only the polynomially-extended columns are removed from plain scaling; the
    # spline-extended columns therefore also stay in the 'scale_to_standard' branch.
    plain_scaled_columns = tuple(
        column
        for column in features_to_scale_to_standard_columns
        if column not in features_to_extend_as_polynomial
    )
    plain_scaling_branch = build_features_scaler_standard()
    target_encoding_branch = build_categorical_features_encoder_target(random_state=0x2ED6)

    branches = [
        ('extend_features_as_polynomial', polynomial_branch, features_to_extend_as_polynomial),
        ('extend_features_as_spline', spline_branch, features_to_extend_as_spline),
        ('scale_to_standard', plain_scaling_branch, plain_scaled_columns),
        ('encode_categoricals_wrt_target', target_encoding_branch, features_to_encode_wrt_target_columns),
    ]
    return sklearn.compose.ColumnTransformer(branches, remainder='drop')

# Which parameters of the augmenting ColumnTransformer are reported when logging a
# model. Keys mirror the transformer / pipeline-step names used in
# build_preprocess_augmenting_transformer and must match them exactly.
PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_INCLUDE = {
    **{k: True for k in COLUMN_TRANSFORMER_PARAMS_COMMON_INCLUDE},
    'extend_features_as_polynomial': {
        'extend_features': True,
        'scale_to_standard': True,
    },
    'extend_features_as_spline': True,
    'scale_to_standard': True,
    # Fixed key: the transformer is named 'encode_categoricals_wrt_target' (plural).
    # The previous singular spelling matched nothing, so the TargetEncoder
    # parameters never reached the logged parameter set.
    'encode_categoricals_wrt_target': True,
}
# Parameters removed again from the included groups (applied to both StandardScaler
# instances: the one inside the polynomial branch and the stand-alone one).
PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_EXCLUDE = {
    'extend_features_as_polynomial': {
        'scale_to_standard': STANDARD_SCALER_PARAMS_COMMON_EXCLUDE,
    },
    'scale_to_standard': STANDARD_SCALER_PARAMS_COMMON_EXCLUDE,
}

# Unfitted augmenting transformer that becomes the 'preprocess' step of the pipeline.
preprocess_transformer = build_preprocess_augmenting_transformer()
preprocess_transformer

ColumnTransformer(transformers=[('extend_features_as_polynomial',
                                 Pipeline(steps=[('extend_features',
                                                  PolynomialFeatures(include_bias=False)),
                                                 ('scale_to_standard',
                                                  StandardScaler())]),
                                 ('selling_price', 'driven_kms')),
                                ('extend_features_as_spline',
                                 SplineTransformer(include_bias=False,
                                                   knots='quantile',
                                                   n_knots=4),
                                 ('age',)),
                                ('scale_to_standard', StandardScaler(),
                                 ('age',)),
                                ('encode_categoricals_wrt_target',
                                 TargetEncoder(random_state=11990,
                                               target_type='continuous'),
                                 ('fuel_type', 'selling_type',
                                  'transmission'))])

ColumnTransformer

?Documentation for ColumnTransformeriNot fitted

Parameters

	transformers	[('extend_features_as_polynomial', ...), ('extend_features_as_spline', ...), ...]
	remainder	'drop'
	sparse_threshold	0.3
	n_jobs	None
	transformer_weights	None
	verbose	False
	verbose_feature_names_out	True
	force_int_remainder_cols	'deprecated'

extend_features_as_polynomial

('selling_price', 'driven_kms')

PolynomialFeatures

Parameters

	degree	2
	interaction_only	False
	include_bias	False
	order	'C'

StandardScaler

Parameters

	copy	True
	with_mean	True
	with_std	True

extend_features_as_spline

('age',)

SplineTransformer

Parameters

	n_knots	4
	degree	3
	knots	'quantile'
	extrapolation	'constant'
	include_bias	False
	order	'C'
	sparse_output	False

scale_to_standard

('age',)

StandardScaler

Parameters

	copy	True
	with_mean	True
	with_std	True

encode_categoricals_wrt_target

('fuel_type', 'selling_type', 'transmission')

TargetEncoder

Parameters

	categories	'auto'
	target_type	'continuous'
	smooth	'auto'
	cv	5
	shuffle	True
	random_state	11990

Демонстрация предобработки данных:

# Fit a separate, throw-away transformer instance on the training split to obtain
# the augmented training features as a DataFrame; the instance is deleted afterwards.
preprocess_transformer_tmp = build_preprocess_augmenting_transformer()
df_augd_features_matrix_train = preprocess_transformer_tmp.fit_transform(df_orig_features_train, df_target_train.iloc[:, 0])
df_augd_features_train = pandas_dataframe_from_transformed_artifacts(df_augd_features_matrix_train, preprocess_transformer_tmp)
del preprocess_transformer_tmp

Обзор предобработанного датасета:

# Column names, non-null counts and dtypes of the augmented training frame.
df_augd_features_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224 entries, 0 to 223
Data columns (total 14 columns):
 #   Column                                                   Non-Null Count  Dtype  
---  ------                                                   --------------  -----  
 0   extend_features_as_polynomial__selling_price             224 non-null    float64
 1   extend_features_as_polynomial__driven_kms                224 non-null    float64
 2   extend_features_as_polynomial__selling_price^2           224 non-null    float64
 3   extend_features_as_polynomial__selling_price driven_kms  224 non-null    float64
 4   extend_features_as_polynomial__driven_kms^2              224 non-null    float64
 5   extend_features_as_spline__age_sp_0                      224 non-null    float64
 6   extend_features_as_spline__age_sp_1                      224 non-null    float64
 7   extend_features_as_spline__age_sp_2                      224 non-null    float64
 8   extend_features_as_spline__age_sp_3                      224 non-null    float64
 9   extend_features_as_spline__age_sp_4                      224 non-null    float64
 10  scale_to_standard__age                                   224 non-null    float64
 11  encode_categoricals_wrt_target__fuel_type                224 non-null    float64
 12  encode_categoricals_wrt_target__selling_type             224 non-null    float64
 13  encode_categoricals_wrt_target__transmission             224 non-null    float64
dtypes: float64(14)
memory usage: 24.6 KB

# Preview the first 8 rows of the augmented training frame.
df_augd_features_train.head(0x8)

	extend_features_as_polynomial__selling_price	extend_features_as_polynomial__driven_kms	extend_features_as_polynomial__selling_price^2	extend_features_as_polynomial__selling_price driven_kms	extend_features_as_polynomial__driven_kms^2	extend_features_as_spline__age_sp_0	extend_features_as_spline__age_sp_1	extend_features_as_spline__age_sp_2	extend_features_as_spline__age_sp_3	extend_features_as_spline__age_sp_4	scale_to_standard__age	encode_categoricals_wrt_target__fuel_type	encode_categoricals_wrt_target__selling_type	encode_categoricals_wrt_target__transmission
0	-0.104244	-0.059337	-0.160142	-0.184156	-0.213392	0.000000	0.000000	0.284444	0.614343	0.099879	0.983159	3.418066	6.723044	4.251590
1	0.524405	-0.930984	0.023111	-0.341051	-0.467047	0.049383	0.528395	0.417778	0.004444	0.000000	-1.141223	9.374655	6.400821	3.750236
2	-0.364071	-0.699614	-0.204196	-0.411821	-0.427250	0.006173	0.303549	0.654722	0.035556	0.000000	-0.787159	3.313404	7.018116	4.015122
3	-0.686652	-0.942552	-0.233103	-0.493887	-0.468514	0.006173	0.303549	0.654722	0.035556	0.000000	-0.787159	3.532072	0.673151	4.202766
4	-0.291407	0.090899	-0.193742	-0.236248	-0.141138	0.000000	0.100000	0.780000	0.120000	0.000000	-0.433096	4.968111	7.161109	4.059384
5	-0.747205	-0.236874	-0.235345	-0.474524	-0.287960	0.000000	0.000000	0.190556	0.640202	0.164742	1.337222	3.080685	0.697119	3.750236
6	0.026771	1.112782	-0.130900	0.246412	0.572931	0.000000	0.000000	0.000000	0.227273	0.606061	3.815667	3.532072	6.675406	4.202766
7	-0.180210	-0.066162	-0.174939	-0.219328	-0.216475	0.000000	0.100000	0.780000	0.120000	0.000000	-0.433096	3.284326	7.161109	4.059384

# Fresh baseline regressor for the pipeline with engineered features.
# NOTE(review): a seed (0x3AEF) is passed here, yet the estimator repr printed below
# reports random_state=None — verify that build_regressor_baseline forwards
# random_state, otherwise the reported metrics are not reproducible.
regressor = build_regressor_baseline(random_state=0x3AEF)
regressor

RandomForestRegressor(max_depth=8, max_features='sqrt', n_estimators=10)

RandomForestRegressor

?Documentation for RandomForestRegressoriNot fitted

Parameters

	n_estimators	10
	criterion	'squared_error'
	max_depth	8
	min_samples_split	2
	min_samples_leaf	1
	min_weight_fraction_leaf	0.0
	max_features	'sqrt'
	max_leaf_nodes	None
	min_impurity_decrease	0.0
	bootstrap	True
	oob_score	False
	n_jobs	None
	random_state	None
	verbose	0
	warm_start	False
	ccp_alpha	0.0
	max_samples	None
	monotonic_cst	None

Составной пайплайн:

# Two-step pipeline: augmenting preprocessing followed by the regressor.
pipeline = sklearn.pipeline.Pipeline([
    ('preprocess', preprocess_transformer),
    ('regress', regressor),
])
pipeline

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('extend_features_as_polynomial',
                                                  Pipeline(steps=[('extend_features',
                                                                   PolynomialFeatures(include_bias=False)),
                                                                  ('scale_to_standard',
                                                                   StandardScaler())]),
                                                  ('selling_price',
                                                   'driven_kms')),
                                                 ('extend_features_as_spline',
                                                  SplineTransformer(include_bias=False,
                                                                    knots='quantile',
                                                                    n_knots=4),
                                                  ('age',)),
                                                 ('scale_to_standard',
                                                  StandardScaler(), ('age',)),
                                                 ('encode_categoricals_wrt_target',
                                                  TargetEncoder(random_state=11990,
                                                                target_type='continuous'),
                                                  ('fuel_type', 'selling_type',
                                                   'transmission'))])),
                ('regress',
                 RandomForestRegressor(max_depth=8, max_features='sqrt',
                                       n_estimators=10))])

Pipeline

Parameters

	steps	[('preprocess', ...), ('regress', ...)]
	transform_input	None
	memory	None
	verbose	False

preprocess: ColumnTransformer

Parameters

	transformers	[('extend_features_as_polynomial', ...), ('extend_features_as_spline', ...), ...]
	remainder	'drop'
	sparse_threshold	0.3
	n_jobs	None
	transformer_weights	None
	verbose	False
	verbose_feature_names_out	True
	force_int_remainder_cols	'deprecated'

extend_features_as_polynomial

('selling_price', 'driven_kms')

PolynomialFeatures

Parameters

	degree	2
	interaction_only	False
	include_bias	False
	order	'C'

StandardScaler

Parameters

	copy	True
	with_mean	True
	with_std	True

extend_features_as_spline

('age',)

SplineTransformer

Parameters

	n_knots	4
	degree	3
	knots	'quantile'
	extrapolation	'constant'
	include_bias	False
	order	'C'
	sparse_output	False

scale_to_standard

('age',)

StandardScaler

Parameters

	copy	True
	with_mean	True
	with_std	True

encode_categoricals_wrt_target

('fuel_type', 'selling_type', 'transmission')

TargetEncoder

Parameters

	categories	'auto'
	target_type	'continuous'
	smooth	'auto'
	cv	5
	shuffle	True
	random_state	11990

RandomForestRegressor

Parameters

	n_estimators	10
	criterion	'squared_error'
	max_depth	8
	min_samples_split	2
	min_samples_leaf	1
	min_weight_fraction_leaf	0.0
	max_features	'sqrt'
	max_leaf_nodes	None
	min_impurity_decrease	0.0
	bootstrap	True
	oob_score	False
	n_jobs	None
	random_state	None
	verbose	0
	warm_start	False
	ccp_alpha	0.0
	max_samples	None
	monotonic_cst	None

# Hyper-parameters of the engineered-features pipeline that are logged with the model.
# The shared include/exclude specs are shallow-copied before being passed on.
model_params = filter_params(
    pipeline.get_params(),
    include={
        'preprocess': (False, PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_INCLUDE.copy()),
        'regress': (False, True),
    },
    exclude={
        'preprocess': PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_EXCLUDE.copy(),
        'regress': RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE,
    },
)
model_params

{'preprocess__remainder': 'drop',
 'preprocess__sparse_threshold': 0.3,
 'preprocess__transformer_weights': None,
 'preprocess__extend_features_as_spline': SplineTransformer(include_bias=False, knots='quantile', n_knots=4),
 'preprocess__extend_features_as_polynomial__extend_features': PolynomialFeatures(include_bias=False),
 'preprocess__extend_features_as_polynomial__extend_features__degree': 2,
 'preprocess__extend_features_as_polynomial__extend_features__include_bias': False,
 'preprocess__extend_features_as_polynomial__extend_features__interaction_only': False,
 'preprocess__extend_features_as_polynomial__extend_features__order': 'C',
 'preprocess__extend_features_as_polynomial__scale_to_standard__with_mean': True,
 'preprocess__extend_features_as_polynomial__scale_to_standard__with_std': True,
 'preprocess__extend_features_as_spline__degree': 3,
 'preprocess__extend_features_as_spline__extrapolation': 'constant',
 'preprocess__extend_features_as_spline__include_bias': False,
 'preprocess__extend_features_as_spline__knots': 'quantile',
 'preprocess__extend_features_as_spline__n_knots': 4,
 'preprocess__extend_features_as_spline__order': 'C',
 'preprocess__extend_features_as_spline__sparse_output': False,
 'preprocess__scale_to_standard__with_mean': True,
 'preprocess__scale_to_standard__with_std': True,
 'regress__bootstrap': True,
 'regress__ccp_alpha': 0.0,
 'regress__criterion': 'squared_error',
 'regress__max_depth': 8,
 'regress__max_features': 'sqrt',
 'regress__max_leaf_nodes': None,
 'regress__max_samples': None,
 'regress__min_impurity_decrease': 0.0,
 'regress__min_samples_leaf': 1,
 'regress__min_samples_split': 2,
 'regress__min_weight_fraction_leaf': 0.0,
 'regress__monotonic_cst': None,
 'regress__n_estimators': 10,
 'regress__oob_score': False,
 'regress__random_state': None}

Обучение модели:

# Fit the engineered-features pipeline on the training split (target = first column
# of the target frame).
_ = pipeline.fit(df_orig_features_train, df_target_train.iloc[:, 0])

Оценка качества:

# Predict the target for the held-out test features with the fitted pipeline.
target_test_predicted = pipeline.predict(df_orig_features_test)

Метрики качества (MAPE, а также MSE, MAE):

# Score the test predictions; the result shown below has 'mse', 'mae' and 'mape' entries.
metrics = score_predictions(df_target_test, target_test_predicted)
metrics

{'mse': 1.5006829920671902,
 'mae': 0.7582020656775502,
 'mape': 0.30794862210624835}

# Log the fitted pipeline, its filtered parameters and its metrics to MLflow under
# the nested run name 'Model with engineered features'. Metric values are cast to
# plain floats before logging.
mlflow_log_model(
    pipeline,
    model_params=model_params,
    metrics={k: float(v) for k, v in metrics.items()},
    nested_run_name='Model with engineered features',
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
    # NOTE: attaching a global comment file is currently disabled.
    #global_comment_file_path=(
    #    model_comment_path
    #    if model_comment_path is not None
    #    else (BASE_PATH / 'research' / model_comment_relpath)
    #),
)

{"model_id":"5821a1adbbe242a882fed4dd765843c8","version_major":2,"version_minor":0}

🏃 View run Model with engineered features at: http://localhost:5000/#/experiments/1/runs/4c056d60749f459196e55711971a0525
🧪 View experiment at: http://localhost:5000/#/experiments/1
🏃 View run Models at: http://localhost:5000/#/experiments/1/runs/4e4a9094cb3c4eed9d4a056a27cadcd9
🧪 View experiment at: http://localhost:5000/#/experiments/1

Модель с дополнительными и отфильтрованными признаками

def build_selected_columns_info_for_mlflow(names=None, indices=None):
    """Return a dict describing the selected columns.

    The result holds a 'names' entry and/or an 'indices' entry (in that order),
    each present only when the corresponding argument is not None.
    """
    candidates = (('names', names), ('indices', indices))
    return {key: value for key, value in candidates if value is not None}

def build_extra_logs_handler_selected_columns(names=None, indices=None):
    """Return a callback that logs the selected-columns info as a JSON artifact.

    The callback takes one argument exposing ``log_dict`` and writes the info dict
    to 'selected_columns_info.json'. It does nothing when both ``names`` and
    ``indices`` are None.
    """
    def extra_log(mlf):
        if names is None and indices is None:
            return
        mlf.log_dict(
            build_selected_columns_info_for_mlflow(names=names, indices=indices),
            'selected_columns_info.json',
        )
    return extra_log

def build_selected_columns_info_for_mlflow_from_sequential_feature_selector(feature_selector, *, take_names=True, take_indices=True):
    """Build the selected-columns info dict from a fitted feature selector.

    Reads ``k_feature_names_`` when ``take_names`` is true and ``k_feature_idx_``
    (converted to a tuple) when ``take_indices`` is true; a part that is switched
    off is left out of the result.
    """
    selected_names = None
    if take_names:
        selected_names = feature_selector.k_feature_names_

    selected_indices = None
    if take_indices:
        selected_indices = tuple(feature_selector.k_feature_idx_)

    return build_selected_columns_info_for_mlflow(names=selected_names, indices=selected_indices)

def build_extra_logs_handler_selected_columns_from_sequential_feature_selector(feature_selector):
    """Return a callback that logs the selector's chosen columns as a JSON artifact.

    The selector's attributes are read when the callback is invoked, not when it
    is built; the info is written to 'selected_columns_info.json' via ``log_dict``.
    """
    def extra_log(mlf):
        mlf.log_dict(
            build_selected_columns_info_for_mlflow_from_sequential_feature_selector(feature_selector),
            'selected_columns_info.json',
        )
    return extra_log

# Fresh baseline regressor, used both as the feature selector's estimator and as
# the final 'regress' step.
# NOTE(review): a seed (0x8EDD) is passed here, yet the estimator repr printed below
# reports random_state=None — verify that build_regressor_baseline forwards
# random_state.
regressor = build_regressor_baseline(random_state=0x8EDD)
regressor

RandomForestRegressor(max_depth=8, max_features='sqrt', n_estimators=10)

RandomForestRegressor

?Documentation for RandomForestRegressoriNot fitted

Parameters

	n_estimators	10
	criterion	'squared_error'
	max_depth	8
	min_samples_split	2
	min_samples_leaf	1
	min_weight_fraction_leaf	0.0
	max_features	'sqrt'
	max_leaf_nodes	None
	min_impurity_decrease	0.0
	bootstrap	True
	oob_score	False
	n_jobs	None
	random_state	None
	verbose	0
	warm_start	False
	ccp_alpha	0.0
	max_samples	None
	monotonic_cst	None

Выбор признаков среди дополненного набора по минимизации MAPE:

# Number of candidate features in the augmented training frame.
len(df_augd_features_train.columns)

# Passed to the selector as `k_features`; the plotting code below treats a sequence
# value as a (lower, upper) region and also tolerates a single number.
FILTERED_FEATURES_NUM = (4, 8)

def build_feature_selector(*, verbose=0):
    """Build the sequential feature selector scored by negative MAPE.

    Wraps the module-level ``regressor`` in a forward, floating search with 4-fold
    cross-validation; the number of features to keep is ``FILTERED_FEATURES_NUM``.

    verbose: verbosity level passed through to the selector.
    """
    selector_options = dict(
        k_features=FILTERED_FEATURES_NUM,
        forward=True,
        floating=True,
        cv=4,
        scoring='neg_mean_absolute_percentage_error',
        verbose=verbose,
    )
    return build_sequential_feature_selector(regressor, **selector_options)

# Selector parameters reported when logging a model: the common selector set, with
# the nested 'estimator' explicitly switched off.
FEATURE_SELECTOR_PARAMS_COMMON_INCLUDE = {
    **{k: True for k in SEQUENTIAL_FEATURE_SELECTOR_PARAMS_COMMON_INCLUDE},
    'estimator': False,
}
FEATURE_SELECTOR_PARAMS_COMMON_EXCLUDE = ()  # TODO: tsk-tsk — still an empty placeholder, nothing is excluded

# Selector instance with progress output enabled; it is fitted below and later
# placed into the pipeline as the 'select_features' step.
feature_selector = build_feature_selector(verbose=1)
feature_selector

SequentialFeatureSelector(cv=4,
                          estimator=RandomForestRegressor(max_depth=8,
                                                          max_features='sqrt',
                                                          n_estimators=10),
                          floating=True, k_features=(4, 8),
                          scoring='neg_mean_absolute_percentage_error',
                          verbose=1)

SequentialFeatureSelector

iFitted

Parameters

	estimator	RandomForestR...estimators=10)
	k_features	(4, ...)
	forward	True
	floating	True
	verbose	1
	scoring	'neg_mean_absolute_percentage_error'
	cv	4
	n_jobs	1
	pre_dispatch	'2*n_jobs'
	clone_estimator	True
	fixed_features	None
	feature_groups	None

estimator: RandomForestRegressor

RandomForestRegressor(max_depth=8, max_features='sqrt', n_estimators=10)

RandomForestRegressor

Parameters

	n_estimators	10
	criterion	'squared_error'
	max_depth	8
	min_samples_split	2
	min_samples_leaf	1
	min_weight_fraction_leaf	0.0
	max_features	'sqrt'
	max_leaf_nodes	None
	min_impurity_decrease	0.0
	bootstrap	True
	oob_score	False
	n_jobs	None
	random_state	None
	verbose	0
	warm_start	False
	ccp_alpha	0.0
	max_samples	None
	monotonic_cst	None

# Run the feature search on the already-augmented training frame.
_ = feature_selector.fit(df_augd_features_train, df_target_train.iloc[:, 0])

[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:    0.6s finished
Features: 1/8[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    0.5s finished
Features: 2/8[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s finished
Features: 3/8[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished
Features: 4/8[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s finished
Features: 5/8[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
Features: 6/8[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s finished
Features: 5/8[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
Features: 6/8[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.2s finished
Features: 7/8[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.2s finished
Features: 7/8[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.2s finished
Features: 8/8

Выбранные признаки (имена и индексы):

# Names and indices of the features chosen by the fitted selector.
build_selected_columns_info_for_mlflow_from_sequential_feature_selector(feature_selector)

{'names': ('extend_features_as_polynomial__selling_price',
  'extend_features_as_polynomial__selling_price^2',
  'extend_features_as_spline__age_sp_1',
  'extend_features_as_spline__age_sp_2',
  'scale_to_standard__age'),
 'indices': (0, 2, 6, 7, 10)}

MAPE в зависимости от количества выбранных признаков (указан регион выбора, ограниченный FILTERED_FEATURES_NUM):

# Plot the selection score against the number of features, with a std-dev band.
fig, ax = plot_sequential_feature_selection(feature_selector, kind='std_dev')
ax.grid(True)
# It would be nicer to use `len(df_augd_features_train.columns)` as the upper x limit,
# but SequentialFeatureSelector does not evaluate subsets up to that size.
if isinstance(FILTERED_FEATURES_NUM, Sequence):
    # Shade the allowed feature-count region and stop the x axis at its upper bound.
    _ = ax.axvspan(
        min(FILTERED_FEATURES_NUM),
        max(FILTERED_FEATURES_NUM),
        color=matplotlib.colormaps.get_cmap('tab10')(6),
        alpha=0.15,
    )
    _ = ax.set_xlim((1, max(FILTERED_FEATURES_NUM)))
else:
    _ = ax.set_xlim((1, FILTERED_FEATURES_NUM))
# The score is a negated error, so the y axis is capped at zero.
_ = ax.set_ylim((None, 0.))

Составной пайплайн:

# Three-step pipeline: augmenting preprocessing, feature selection, regression.
# NOTE: `feature_selector` is the same instance that was fitted above; fitting the
# pipeline runs its search again (see the progress log after `pipeline.fit`).
pipeline = sklearn.pipeline.Pipeline([
    ('preprocess', build_preprocess_augmenting_transformer()),
    ('select_features', feature_selector),
    ('regress', regressor),
])
pipeline

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('extend_features_as_polynomial',
                                                  Pipeline(steps=[('extend_features',
                                                                   PolynomialFeatures(include_bias=False)),
                                                                  ('scale_to_standard',
                                                                   StandardScaler())]),
                                                  ('selling_price',
                                                   'driven_kms')),
                                                 ('extend_features_as_spline',
                                                  SplineTransformer(include_bias=False,
                                                                    knots='quantile',
                                                                    n_knots=4),
                                                  ('age',)),
                                                 ('s...
                                                  ('fuel_type', 'selling_type',
                                                   'transmission'))])),
                ('select_features',
                 SequentialFeatureSelector(cv=4,
                                           estimator=RandomForestRegressor(max_depth=8,
                                                                           max_features='sqrt',
                                                                           n_estimators=10),
                                           floating=True, k_features=(4, 8),
                                           scoring='neg_mean_absolute_percentage_error',
                                           verbose=1)),
                ('regress',
                 RandomForestRegressor(max_depth=8, max_features='sqrt',
                                       n_estimators=10))])

Pipeline

Parameters

	steps	[('preprocess', ...), ('select_features', ...), ...]
	transform_input	None
	memory	None
	verbose	False

preprocess: ColumnTransformer

Parameters

	transformers	[('extend_features_as_polynomial', ...), ('extend_features_as_spline', ...), ...]
	remainder	'drop'
	sparse_threshold	0.3
	n_jobs	None
	transformer_weights	None
	verbose	False
	verbose_feature_names_out	True
	force_int_remainder_cols	'deprecated'

extend_features_as_polynomial

('selling_price', 'driven_kms')

PolynomialFeatures

Parameters

	degree	2
	interaction_only	False
	include_bias	False
	order	'C'

StandardScaler

Parameters

	copy	True
	with_mean	True
	with_std	True

extend_features_as_spline

('age',)

SplineTransformer

Parameters

	n_knots	4
	degree	3
	knots	'quantile'
	extrapolation	'constant'
	include_bias	False
	order	'C'
	sparse_output	False

scale_to_standard

('age',)

StandardScaler

Parameters

	copy	True
	with_mean	True
	with_std	True

encode_categoricals_wrt_target

('fuel_type', 'selling_type', 'transmission')

TargetEncoder

Parameters

	categories	'auto'
	target_type	'continuous'
	smooth	'auto'
	cv	5
	shuffle	True
	random_state	11990

select_features: SequentialFeatureSelector

Parameters

	estimator	RandomForestR...estimators=10)
	k_features	(4, ...)
	forward	True
	floating	True
	verbose	1
	scoring	'neg_mean_absolute_percentage_error'
	cv	4
	n_jobs	1
	pre_dispatch	'2*n_jobs'
	clone_estimator	True
	fixed_features	None
	feature_groups	None

estimator: RandomForestRegressor

RandomForestRegressor(max_depth=8, max_features='sqrt', n_estimators=10)

RandomForestRegressor

Parameters

	n_estimators	10
	criterion	'squared_error'
	max_depth	8
	min_samples_split	2
	min_samples_leaf	1
	min_weight_fraction_leaf	0.0
	max_features	'sqrt'
	max_leaf_nodes	None
	min_impurity_decrease	0.0
	bootstrap	True
	oob_score	False
	n_jobs	None
	random_state	None
	verbose	0
	warm_start	False
	ccp_alpha	0.0
	max_samples	None
	monotonic_cst	None

RandomForestRegressor

Parameters

	n_estimators	10
	criterion	'squared_error'
	max_depth	8
	min_samples_split	2
	min_samples_leaf	1
	min_weight_fraction_leaf	0.0
	max_features	'sqrt'
	max_leaf_nodes	None
	min_impurity_decrease	0.0
	bootstrap	True
	oob_score	False
	n_jobs	None
	random_state	None
	verbose	0
	warm_start	False
	ccp_alpha	0.0
	max_samples	None
	monotonic_cst	None

# Hyper-parameters of the pipeline with feature selection that are logged with the
# model. The dict-valued shared specs are shallow-copied before being passed on.
model_params = filter_params(
    pipeline.get_params(),
    include={
        'preprocess': (False, PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_INCLUDE.copy()),
        'select_features': (False, FEATURE_SELECTOR_PARAMS_COMMON_INCLUDE.copy()),
        'regress': (False, True),
    },
    exclude={
        'preprocess': PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_EXCLUDE.copy(),
        'select_features': FEATURE_SELECTOR_PARAMS_COMMON_EXCLUDE,
        'regress': RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE,
    },
)
model_params

{'preprocess__remainder': 'drop',
 'preprocess__sparse_threshold': 0.3,
 'preprocess__transformer_weights': None,
 'preprocess__extend_features_as_spline': SplineTransformer(include_bias=False, knots='quantile', n_knots=4),
 'preprocess__extend_features_as_polynomial__extend_features': PolynomialFeatures(include_bias=False),
 'preprocess__extend_features_as_polynomial__extend_features__degree': 2,
 'preprocess__extend_features_as_polynomial__extend_features__include_bias': False,
 'preprocess__extend_features_as_polynomial__extend_features__interaction_only': False,
 'preprocess__extend_features_as_polynomial__extend_features__order': 'C',
 'preprocess__extend_features_as_polynomial__scale_to_standard__with_mean': True,
 'preprocess__extend_features_as_polynomial__scale_to_standard__with_std': True,
 'preprocess__extend_features_as_spline__degree': 3,
 'preprocess__extend_features_as_spline__extrapolation': 'constant',
 'preprocess__extend_features_as_spline__include_bias': False,
 'preprocess__extend_features_as_spline__knots': 'quantile',
 'preprocess__extend_features_as_spline__n_knots': 4,
 'preprocess__extend_features_as_spline__order': 'C',
 'preprocess__extend_features_as_spline__sparse_output': False,
 'preprocess__scale_to_standard__with_mean': True,
 'preprocess__scale_to_standard__with_std': True,
 'select_features__cv': 4,
 'select_features__feature_groups': None,
 'select_features__fixed_features': None,
 'select_features__floating': True,
 'select_features__forward': True,
 'select_features__k_features': (4, 8),
 'select_features__scoring': 'neg_mean_absolute_percentage_error',
 'regress__bootstrap': True,
 'regress__ccp_alpha': 0.0,
 'regress__criterion': 'squared_error',
 'regress__max_depth': 8,
 'regress__max_features': 'sqrt',
 'regress__max_leaf_nodes': None,
 'regress__max_samples': None,
 'regress__min_impurity_decrease': 0.0,
 'regress__min_samples_leaf': 1,
 'regress__min_samples_split': 2,
 'regress__min_weight_fraction_leaf': 0.0,
 'regress__monotonic_cst': None,
 'regress__n_estimators': 10,
 'regress__oob_score': False,
 'regress__random_state': None}

Обучение модели:

# XXX: SequentialFeatureSelector is being fitted again!?
# Fit the whole pipeline on the training split; `.iloc[:, 0]` passes the first
# column of the target frame as the target instead of the whole DataFrame.
# The result is bound to `_`, presumably to keep the notebook from echoing it.
_ = pipeline.fit(df_orig_features_train, df_target_train.iloc[:, 0])

[Parallel(n_jobs=1)]: Done  14 out of  14 | elapsed:    0.5s finished
Features: 1/8[Parallel(n_jobs=1)]: Done  13 out of  13 | elapsed:    0.5s finished
Features: 2/8[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s finished
Features: 3/8[Parallel(n_jobs=1)]: Done  11 out of  11 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s finished
Features: 4/8[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.1s finished
Features: 5/8[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
Features: 6/8[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.2s finished
Features: 7/8[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.2s finished
Features: 8/8

Оценка качества:

# Predict the target for the test-split features with the fitted pipeline.
target_test_predicted = pipeline.predict(df_orig_features_test)

Метрики качества (MAPE, а также MSE, MAE):

# Score the test predictions against the true test target.
metrics = score_predictions(df_target_test, target_test_predicted)
# bare expression: shown as the cell output
metrics

{'mse': 1.0194872911964548,
 'mae': 0.6263087407494466,
 'mape': 0.20033337884798225}

# Log the fitted pipeline together with its filtered parameters and test metrics
# as a nested MLflow run.
mlflow_log_model(
    pipeline,
    model_params=model_params,
    # metric values are converted to plain Python floats before logging
    metrics={name: float(value) for name, value in metrics.items()},
    nested_run_name='Model with filtered engineered features',
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
    # (disabled) global comment file argument:
    #global_comment_file_path=(
    #    model_comment_path
    #    if model_comment_path is not None
    #    else (BASE_PATH / 'research' / model_comment_relpath)
    #),
    # one-element tuple of extra log handlers, built from the pipeline's
    # 'select_features' step
    extra_logs_handler=(
        build_extra_logs_handler_selected_columns_from_sequential_feature_selector(
            pipeline.named_steps['select_features'],
        ),
    ),
)

{"model_id":"15d75fa1d12046c8b197bf0ac21439b9","version_major":2,"version_minor":0}

🏃 View run Model with filtered engineered features at: http://localhost:5000/#/experiments/1/runs/2236e7acb9df4f689ca0b660e216560d
🧪 View experiment at: http://localhost:5000/#/experiments/1
🏃 View run Models at: http://localhost:5000/#/experiments/1/runs/4e4a9094cb3c4eed9d4a056a27cadcd9
🧪 View experiment at: http://localhost:5000/#/experiments/1

Автоматический подбор гиперпараметров модели

Составной пайплайн:

def build_pipeline(regressor_n_estimators, regressor_max_depth=None, regressor_max_features='sqrt'):
    """Assemble the three-step pipeline: preprocess -> select_features -> regress.

    Args:
        regressor_n_estimators: passed positionally to ``build_regressor``.
        regressor_max_depth: forwarded to ``build_regressor`` as ``max_depth``.
        regressor_max_features: forwarded to ``build_regressor`` as ``max_features``.

    Returns:
        A new ``sklearn.pipeline.Pipeline`` with the steps named
        'preprocess', 'select_features' and 'regress'.
    """
    # the step objects are created in the same order as they appear in the pipeline
    preprocessor = build_preprocess_augmenting_transformer()
    feature_selector = build_feature_selector()
    regressor = build_regressor(
        regressor_n_estimators,
        max_depth=regressor_max_depth,
        max_features=regressor_max_features,
    )
    named_steps = [
        ('preprocess', preprocessor),
        ('select_features', feature_selector),
        ('regress', regressor),
    ]
    return sklearn.pipeline.Pipeline(named_steps)

Целевая функция для оптимизатора гиперпараметров (подбирает параметры RandomForestRegressor: n_estimators, max_depth, max_features):

def regressor_hyperparams_objective(trial, cv=4):
    """Optuna objective: cross-validated MAPE of the pipeline on the training split.

    Samples ``n_estimators``, ``max_depth`` and ``max_features`` for the
    regressor, builds the full pipeline with them and scores it by
    cross-validation on the training data only.

    The test split is deliberately not used here: tuning hyperparameters
    against the same data that later produces the reported test metrics leaks
    test information into the model choice and makes those metrics
    optimistically biased.

    Args:
        trial: optuna trial used to sample the hyperparameters.
        cv: cross-validation strategy forwarded to ``cross_val_score``
            (an int means that many folds).

    Returns:
        Mean MAPE over the validation folds as a float (lower is better).
    """
    # local import keeps the block self-contained regardless of which sklearn
    # submodules the notebook has imported so far
    from sklearn.model_selection import cross_val_score

    n_estimators = trial.suggest_int('n_estimators', 1, 256, log=True)
    max_depth = trial.suggest_int('max_depth', 1, 16, log=True)
    max_features = trial.suggest_float('max_features', 0.1, 1.)
    # full pipeline with the sampled regressor hyperparameters:
    pipeline = build_pipeline(n_estimators, regressor_max_depth=max_depth, regressor_max_features=max_features)
    # cross_val_score clones and refits the whole pipeline per fold, so the
    # preprocessing and feature selection never see the fold's validation rows
    neg_mape_per_fold = cross_val_score(
        pipeline,
        df_orig_features_train,
        df_target_train.iloc[:, 0],
        scoring='neg_mean_absolute_percentage_error',
        cv=cv,
    )
    # the scorer is negated MAPE (greater is better); flip the sign back so the
    # study can keep direction='minimize'
    return float(-neg_mape_per_fold.mean())

optuna study:

# TPE sampler with a fixed seed (0x0A1C).
# NOTE(review): the logged model params show regress__random_state = None, so
# the objective values themselves may still differ between runs - confirm.
optuna_sampler = optuna.samplers.TPESampler(seed=0x0A1C)
# the objective returns an error metric, hence direction='minimize'
optuna_study = optuna.create_study(sampler=optuna_sampler, direction='minimize')
# run 24 trials of the objective
optuna_study.optimize(regressor_hyperparams_objective, n_trials=24)

[I 2025-11-02 01:54:34,763] A new study created in memory with name: no-name-a51c5e47-d34c-41d9-a12c-73c911dfc2c9
[I 2025-11-02 01:54:45,860] Trial 0 finished with value: 0.31673042260874146 and parameters: {'n_estimators': 1, 'max_depth': 5, 'max_features': 0.7538601592025193}. Best is trial 0 with value: 0.31673042260874146.
[I 2025-11-02 01:54:55,277] Trial 1 finished with value: 0.2221917240762483 and parameters: {'n_estimators': 243, 'max_depth': 6, 'max_features': 0.8990011000072798}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:55:07,160] Trial 2 finished with value: 0.9128522772564759 and parameters: {'n_estimators': 1, 'max_depth': 3, 'max_features': 0.3925657054705518}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:55:15,650] Trial 3 finished with value: 0.36954875874544413 and parameters: {'n_estimators': 3, 'max_depth': 6, 'max_features': 0.15481298252760906}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:55:22,138] Trial 4 finished with value: 0.44425704345344336 and parameters: {'n_estimators': 6, 'max_depth': 5, 'max_features': 0.1048988611194081}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:55:29,471] Trial 5 finished with value: 0.23556663756910004 and parameters: {'n_estimators': 10, 'max_depth': 9, 'max_features': 0.19332272517658144}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:55:36,076] Trial 6 finished with value: 1.8160905200927615 and parameters: {'n_estimators': 100, 'max_depth': 2, 'max_features': 0.302198580450086}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:55:43,776] Trial 7 finished with value: 0.3732943995516396 and parameters: {'n_estimators': 1, 'max_depth': 4, 'max_features': 0.7063695125561774}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:55:52,289] Trial 8 finished with value: 0.2593980092715887 and parameters: {'n_estimators': 1, 'max_depth': 6, 'max_features': 0.3967800872812408}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:55:58,016] Trial 9 finished with value: 0.7822104999528057 and parameters: {'n_estimators': 12, 'max_depth': 2, 'max_features': 0.9922723597006147}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:56:05,063] Trial 10 finished with value: 2.6164201373576716 and parameters: {'n_estimators': 242, 'max_depth': 1, 'max_features': 0.9886866627656377}. Best is trial 1 with value: 0.2221917240762483.
[I 2025-11-02 01:56:13,968] Trial 11 finished with value: 0.20656714196125883 and parameters: {'n_estimators': 27, 'max_depth': 16, 'max_features': 0.6239771558659984}. Best is trial 11 with value: 0.20656714196125883.
[I 2025-11-02 01:56:20,623] Trial 12 finished with value: 0.20542336297369948 and parameters: {'n_estimators': 49, 'max_depth': 16, 'max_features': 0.6557746668259969}. Best is trial 12 with value: 0.20542336297369948.
[I 2025-11-02 01:56:26,355] Trial 13 finished with value: 0.2000663860477421 and parameters: {'n_estimators': 38, 'max_depth': 16, 'max_features': 0.6301192326123578}. Best is trial 13 with value: 0.2000663860477421.
[I 2025-11-02 01:56:35,951] Trial 14 finished with value: 0.19564416684050806 and parameters: {'n_estimators': 42, 'max_depth': 10, 'max_features': 0.5116506936294666}. Best is trial 14 with value: 0.19564416684050806.
[I 2025-11-02 01:56:43,803] Trial 15 finished with value: 0.2002547156962009 and parameters: {'n_estimators': 39, 'max_depth': 10, 'max_features': 0.5149110154288706}. Best is trial 14 with value: 0.19564416684050806.
[I 2025-11-02 01:56:51,726] Trial 16 finished with value: 0.19801054591841608 and parameters: {'n_estimators': 86, 'max_depth': 14, 'max_features': 0.5190201664970586}. Best is trial 14 with value: 0.19564416684050806.
[I 2025-11-02 01:57:00,437] Trial 17 finished with value: 0.19032917786557116 and parameters: {'n_estimators': 118, 'max_depth': 9, 'max_features': 0.5081814551833768}. Best is trial 17 with value: 0.19032917786557116.
[I 2025-11-02 01:57:10,113] Trial 18 finished with value: 0.1813407546593623 and parameters: {'n_estimators': 103, 'max_depth': 8, 'max_features': 0.4209660896030313}. Best is trial 18 with value: 0.1813407546593623.
[I 2025-11-02 01:57:17,883] Trial 19 finished with value: 3.000176786841034 and parameters: {'n_estimators': 126, 'max_depth': 1, 'max_features': 0.38890086577637156}. Best is trial 18 with value: 0.1813407546593623.
[I 2025-11-02 01:57:25,435] Trial 20 finished with value: 0.22680617405659242 and parameters: {'n_estimators': 22, 'max_depth': 8, 'max_features': 0.2882817069063991}. Best is trial 18 with value: 0.1813407546593623.
[I 2025-11-02 01:57:32,849] Trial 21 finished with value: 0.1752331836664575 and parameters: {'n_estimators': 78, 'max_depth': 10, 'max_features': 0.4752873867901817}. Best is trial 21 with value: 0.1752331836664575.
[I 2025-11-02 01:57:39,316] Trial 22 finished with value: 0.18475613206691036 and parameters: {'n_estimators': 129, 'max_depth': 11, 'max_features': 0.4390718757757792}. Best is trial 21 with value: 0.1752331836664575.
[I 2025-11-02 01:57:46,769] Trial 23 finished with value: 0.18901586701156378 and parameters: {'n_estimators': 67, 'max_depth': 11, 'max_features': 0.4285121891249491}. Best is trial 21 with value: 0.1752331836664575.

Количество выполненных trials:

# Number of trials recorded in the study.
len(optuna_study.trials)

Лучшие найденные гиперпараметры:

# Show the best hyperparameter set as its string representation.
repr(optuna_study.best_params)

"{'n_estimators': 78, 'max_depth': 10, 'max_features': 0.4752873867901817}"

# Keep a shallow copy of the study's best hyperparameters as a plain dict.
regressor_best_params = dict(optuna_study.best_params)

Составной пайплайн:

def build_pipeline_optimized_best():
    """Build a new pipeline configured with the best hyperparameters of the study.

    Reads the module-level ``regressor_best_params`` dict at call time and
    forwards its 'n_estimators', 'max_depth' and 'max_features' entries to
    ``build_pipeline``.
    """
    best = regressor_best_params
    n_estimators = best['n_estimators']
    max_depth = best['max_depth']
    max_features = best['max_features']
    return build_pipeline(
        n_estimators,
        regressor_max_depth=max_depth,
        regressor_max_features=max_features,
    )

# Build a new pipeline with the best hyperparameters found by the study.
pipeline = build_pipeline_optimized_best()
# bare expression: shown as the cell output
pipeline

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('extend_features_as_polynomial',
                                                  Pipeline(steps=[('extend_features',
                                                                   PolynomialFeatures(include_bias=False)),
                                                                  ('scale_to_standard',
                                                                   StandardScaler())]),
                                                  ('selling_price',
                                                   'driven_kms')),
                                                 ('extend_features_as_spline',
                                                  SplineTransformer(include_bias=False,
                                                                    knots='quantile',
                                                                    n_knots=4),
                                                  ('age',)),
                                                 ('s...
                                                  ('fuel_type', 'selling_type',
                                                   'transmission'))])),
                ('select_features',
                 SequentialFeatureSelector(cv=4,
                                           estimator=RandomForestRegressor(max_depth=8,
                                                                           max_features='sqrt',
                                                                           n_estimators=10),
                                           floating=True, k_features=(4, 8),
                                           scoring='neg_mean_absolute_percentage_error')),
                ('regress',
                 RandomForestRegressor(max_depth=10,
                                       max_features=0.4752873867901817,
                                       n_estimators=78))])

Pipeline

Parameters

	steps	[('preprocess', ...), ('select_features', ...), ...]
	transform_input	None
	memory	None
	verbose	False

preprocess: ColumnTransformer

Parameters

	transformers	[('extend_features_as_polynomial', ...), ('extend_features_as_spline', ...), ...]
	remainder	'drop'
	sparse_threshold	0.3
	n_jobs	None
	transformer_weights	None
	verbose	False
	verbose_feature_names_out	True
	force_int_remainder_cols	'deprecated'

extend_features_as_polynomial

('selling_price', 'driven_kms')

PolynomialFeatures

Parameters

	degree	2
	interaction_only	False
	include_bias	False
	order	'C'

StandardScaler

Parameters

	copy	True
	with_mean	True
	with_std	True

extend_features_as_spline

('age',)

SplineTransformer

Parameters

	n_knots	4
	degree	3
	knots	'quantile'
	extrapolation	'constant'
	include_bias	False
	order	'C'
	sparse_output	False

scale_to_standard

('age',)

StandardScaler

Parameters

	copy	True
	with_mean	True
	with_std	True

encode_categoricals_wrt_target

('fuel_type', 'selling_type', 'transmission')

TargetEncoder

Parameters

	categories	'auto'
	target_type	'continuous'
	smooth	'auto'
	cv	5
	shuffle	True
	random_state	11990

select_features: SequentialFeatureSelector

Parameters

	estimator	RandomForestR...estimators=10)
	k_features	(4, ...)
	forward	True
	floating	True
	verbose	0
	scoring	'neg_mean_absolute_percentage_error'
	cv	4
	n_jobs	1
	pre_dispatch	'2*n_jobs'
	clone_estimator	True
	fixed_features	None
	feature_groups	None

estimator: RandomForestRegressor

RandomForestRegressor(max_depth=8, max_features='sqrt', n_estimators=10)

RandomForestRegressor

Parameters

	n_estimators	10
	criterion	'squared_error'
	max_depth	8
	min_samples_split	2
	min_samples_leaf	1
	min_weight_fraction_leaf	0.0
	max_features	'sqrt'
	max_leaf_nodes	None
	min_impurity_decrease	0.0
	bootstrap	True
	oob_score	False
	n_jobs	None
	random_state	None
	verbose	0
	warm_start	False
	ccp_alpha	0.0
	max_samples	None
	monotonic_cst	None

RandomForestRegressor

Parameters

	n_estimators	78
	criterion	'squared_error'
	max_depth	10
	min_samples_split	2
	min_samples_leaf	1
	min_weight_fraction_leaf	0.0
	max_features	0.4752873867901817
	max_leaf_nodes	None
	min_impurity_decrease	0.0
	bootstrap	True
	oob_score	False
	n_jobs	None
	random_state	None
	verbose	0
	warm_start	False
	ccp_alpha	0.0
	max_samples	None
	monotonic_cst	None

# Collect the subset of the optimized pipeline's parameters that is logged with the model.
# NOTE(review): each `include` value looks like a pair
# (include the step object itself?, spec for its nested params) - the resulting
# dict has no bare 'preprocess' / 'select_features' / 'regress' keys but does
# contain their nested `step__param` entries; confirm against filter_params.
# NOTE(review): some specs are passed as `.copy()` and some directly - confirm
# whether filter_params mutates its arguments and make the calls consistent.
model_params = filter_params(
    pipeline.get_params(),
    include={
        'preprocess': (False, PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_INCLUDE.copy()),
        'select_features': (False, FEATURE_SELECTOR_PARAMS_COMMON_INCLUDE.copy()),
        # presumably: keep every nested regressor param not listed in `exclude`
        'regress': (False, True),
    },
    exclude={
        'preprocess': PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_EXCLUDE.copy(),
        'select_features': FEATURE_SELECTOR_PARAMS_COMMON_EXCLUDE,
        'regress': RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE,
    },
)
# bare expression: shown as the cell output
model_params

{'preprocess__remainder': 'drop',
 'preprocess__sparse_threshold': 0.3,
 'preprocess__transformer_weights': None,
 'preprocess__extend_features_as_spline': SplineTransformer(include_bias=False, knots='quantile', n_knots=4),
 'preprocess__extend_features_as_polynomial__extend_features': PolynomialFeatures(include_bias=False),
 'preprocess__extend_features_as_polynomial__extend_features__degree': 2,
 'preprocess__extend_features_as_polynomial__extend_features__include_bias': False,
 'preprocess__extend_features_as_polynomial__extend_features__interaction_only': False,
 'preprocess__extend_features_as_polynomial__extend_features__order': 'C',
 'preprocess__extend_features_as_polynomial__scale_to_standard__with_mean': True,
 'preprocess__extend_features_as_polynomial__scale_to_standard__with_std': True,
 'preprocess__extend_features_as_spline__degree': 3,
 'preprocess__extend_features_as_spline__extrapolation': 'constant',
 'preprocess__extend_features_as_spline__include_bias': False,
 'preprocess__extend_features_as_spline__knots': 'quantile',
 'preprocess__extend_features_as_spline__n_knots': 4,
 'preprocess__extend_features_as_spline__order': 'C',
 'preprocess__extend_features_as_spline__sparse_output': False,
 'preprocess__scale_to_standard__with_mean': True,
 'preprocess__scale_to_standard__with_std': True,
 'select_features__cv': 4,
 'select_features__feature_groups': None,
 'select_features__fixed_features': None,
 'select_features__floating': True,
 'select_features__forward': True,
 'select_features__k_features': (4, 8),
 'select_features__scoring': 'neg_mean_absolute_percentage_error',
 'regress__bootstrap': True,
 'regress__ccp_alpha': 0.0,
 'regress__criterion': 'squared_error',
 'regress__max_depth': 10,
 'regress__max_features': 0.4752873867901817,
 'regress__max_leaf_nodes': None,
 'regress__max_samples': None,
 'regress__min_impurity_decrease': 0.0,
 'regress__min_samples_leaf': 1,
 'regress__min_samples_split': 2,
 'regress__min_weight_fraction_leaf': 0.0,
 'regress__monotonic_cst': None,
 'regress__n_estimators': 78,
 'regress__oob_score': False,
 'regress__random_state': None}

Обучение модели:

# Fit the whole pipeline on the training split; `.iloc[:, 0]` passes the first
# column of the target frame as the target instead of the whole DataFrame.
# The result is bound to `_`, presumably to keep the notebook from echoing it.
_ = pipeline.fit(df_orig_features_train, df_target_train.iloc[:, 0])

Оценка качества:

# Predict the target for the test-split features with the fitted pipeline.
target_test_predicted = pipeline.predict(df_orig_features_test)

Метрики качества (MAPE, а также MSE, MAE):

# Score the test predictions against the true test target.
metrics = score_predictions(df_target_test, target_test_predicted)
# bare expression: shown as the cell output
metrics

{'mse': 0.9370236080018509,
 'mae': 0.6048078379366015,
 'mape': 0.19721535277529492}

# Log the fitted optimized pipeline together with its filtered parameters and
# test metrics as a nested MLflow run.
mlflow_log_model(
    pipeline,
    model_params=model_params,
    # metric values are converted to plain Python floats before logging
    metrics={name: float(value) for name, value in metrics.items()},
    nested_run_name='Optimized model with filtered engineered features',
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
    # (disabled) global comment file argument:
    #global_comment_file_path=(
    #    model_comment_path
    #    if model_comment_path is not None
    #    else (BASE_PATH / 'research' / model_comment_relpath)
    #),
    # one-element tuple of extra log handlers, built from the pipeline's
    # 'select_features' step
    extra_logs_handler=(
        build_extra_logs_handler_selected_columns_from_sequential_feature_selector(
            pipeline.named_steps['select_features'],
        ),
    ),
)

{"model_id":"6f4a84b68c834b93bc62c1982114ddea","version_major":2,"version_minor":0}

🏃 View run Optimized model with filtered engineered features at: http://localhost:5000/#/experiments/1/runs/c8af91a577d24b74adba3348a90b5e69
🧪 View experiment at: http://localhost:5000/#/experiments/1
🏃 View run Models at: http://localhost:5000/#/experiments/1/runs/4e4a9094cb3c4eed9d4a056a27cadcd9
🧪 View experiment at: http://localhost:5000/#/experiments/1

И в продакшн

Лучшая выбранная модель — с автоматически подобранными гиперпараметрами.

# Build a new pipeline with the best hyperparameters found by the study.
pipeline = build_pipeline_optimized_best()
# bare expression: shown as the cell output
pipeline

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('extend_features_as_polynomial',
                                                  Pipeline(steps=[('extend_features',
                                                                   PolynomialFeatures(include_bias=False)),
                                                                  ('scale_to_standard',
                                                                   StandardScaler())]),
                                                  ('selling_price',
                                                   'driven_kms')),
                                                 ('extend_features_as_spline',
                                                  SplineTransformer(include_bias=False,
                                                                    knots='quantile',
                                                                    n_knots=4),
                                                  ('age',)),
                                                 ('s...
                                                  ('fuel_type', 'selling_type',
                                                   'transmission'))])),
                ('select_features',
                 SequentialFeatureSelector(cv=4,
                                           estimator=RandomForestRegressor(max_depth=8,
                                                                           max_features='sqrt',
                                                                           n_estimators=10),
                                           floating=True, k_features=(4, 8),
                                           scoring='neg_mean_absolute_percentage_error')),
                ('regress',
                 RandomForestRegressor(max_depth=10,
                                       max_features=0.4752873867901817,
                                       n_estimators=78))])

Pipeline

Parameters

	steps	[('preprocess', ...), ('select_features', ...), ...]
	transform_input	None
	memory	None
	verbose	False

preprocess: ColumnTransformer

Parameters

	transformers	[('extend_features_as_polynomial', ...), ('extend_features_as_spline', ...), ...]
	remainder	'drop'
	sparse_threshold	0.3
	n_jobs	None
	transformer_weights	None
	verbose	False
	verbose_feature_names_out	True
	force_int_remainder_cols	'deprecated'

extend_features_as_polynomial

('selling_price', 'driven_kms')

PolynomialFeatures

Parameters

	degree	2
	interaction_only	False
	include_bias	False
	order	'C'

StandardScaler

Parameters

	copy	True
	with_mean	True
	with_std	True

extend_features_as_spline

('age',)

SplineTransformer

Parameters

	n_knots	4
	degree	3
	knots	'quantile'
	extrapolation	'constant'
	include_bias	False
	order	'C'
	sparse_output	False

scale_to_standard

('age',)

StandardScaler

Parameters

	copy	True
	with_mean	True
	with_std	True

encode_categoricals_wrt_target

('fuel_type', 'selling_type', 'transmission')

TargetEncoder

Parameters

	categories	'auto'
	target_type	'continuous'
	smooth	'auto'
	cv	5
	shuffle	True
	random_state	11990

select_features: SequentialFeatureSelector

Parameters

	estimator	RandomForestR...estimators=10)
	k_features	(4, ...)
	forward	True
	floating	True
	verbose	0
	scoring	'neg_mean_absolute_percentage_error'
	cv	4
	n_jobs	1
	pre_dispatch	'2*n_jobs'
	clone_estimator	True
	fixed_features	None
	feature_groups	None

estimator: RandomForestRegressor

RandomForestRegressor(max_depth=8, max_features='sqrt', n_estimators=10)

RandomForestRegressor

Parameters

	n_estimators	10
	criterion	'squared_error'
	max_depth	8
	min_samples_split	2
	min_samples_leaf	1
	min_weight_fraction_leaf	0.0
	max_features	'sqrt'
	max_leaf_nodes	None
	min_impurity_decrease	0.0
	bootstrap	True
	oob_score	False
	n_jobs	None
	random_state	None
	verbose	0
	warm_start	False
	ccp_alpha	0.0
	max_samples	None
	monotonic_cst	None

RandomForestRegressor