diff --git a/requirements/requirements-research.txt b/requirements/requirements-research.txt
index a7510fc..1527882 100644
--- a/requirements/requirements-research.txt
+++ b/requirements/requirements-research.txt
@@ -1,3 +1,4 @@
 mlflow >=2.16,<2.22
 mlxtend ~=0.23.4
+optuna ~=4.5
 scikit-learn >=1.7.2,<2
diff --git a/research/research.py b/research/research.py
index f6dba6d..9d6741b 100644
--- a/research/research.py
+++ b/research/research.py
@@ -50,6 +50,8 @@ mlflow_feateng_run_name: str = 'Model with engineered features'
 # Имя ноговго прогона MLFlow для модели, использующей дополнительные признаки
 mlflow_feateng_filtered_run_name: str = 'Model with filtered engineered features'
 # Имя ноговго прогона MLFlow для модели, использующей дополнительные признаки и фильтрацию признаков
+mlflow_optimized_feateng_filtered_run_name: str = 'Optimized model with filtered engineered features'
+# Имя нового прогона MLFlow для модели с оптимизированными гиперпараметрами, использующей дополнительные признаки и фильтрацию признаков
 
 # %%
 from collections.abc import Sequence
@@ -65,6 +67,7 @@ import mlflow.models
 import mlflow.sklearn
 import mlxtend.feature_selection
 import mlxtend.plotting
+import optuna
 import sklearn.compose
 import sklearn.ensemble
 import sklearn.metrics
@@ -257,13 +260,16 @@ def build_categorical_features_encoder_target(*, random_state=None):
 # Регрессор — небольшой случайный лес, цель — минимизация квадрата ошибки предсказания:
 
 # %%
-def build_regressor(*, random_state=None):
+def build_regressor(n_estimators, *, max_depth=None, max_features='sqrt', random_state=None):
     return sklearn.ensemble.RandomForestRegressor(
-        10, criterion='squared_error',
-        max_depth=8, max_features='sqrt',
+        n_estimators, criterion='squared_error',
+        max_depth=max_depth, max_features=max_features,
         random_state=random_state,
     )
 
+def build_regressor_baseline(*, random_state=None):
+    return build_regressor(16, max_depth=8, max_features='sqrt', random_state=random_state)
+
 
 # %%
 def score_predictions(target_test, target_test_predicted):
@@ -327,7 +333,7 @@ preprocess_transformer = sklearn.compose.ColumnTransformer(
 )
 
 # %%
-regressor = build_regressor(random_state=0x016B)
+regressor = build_regressor_baseline(random_state=0x016B)
 regressor
 
 # %% [markdown]
@@ -408,7 +414,7 @@ features_to_extend_as_spline = ('age',)
 
 
 # %%
-def build_preprocess_transformer():
+def build_preprocess_augmenting_transformer():
     assert set(features_to_extend_as_polynomial) <= {*features_to_scale_to_standard_columns}
     assert set(features_to_extend_as_spline) <= {*features_to_scale_to_standard_columns}
     return sklearn.compose.ColumnTransformer(
@@ -445,14 +451,34 @@
         remainder='drop',
     )
 
-preprocess_transformer = build_preprocess_transformer()
+
+# %%
+PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_INCLUDE = {
+    **{k: True for k in COLUMN_TRANSFORMER_PARAMS_COMMON_INCLUDE},
+    'extend_features_as_polynomial': {
+        'extend_features': True,
+        'scale_to_standard': True,
+    },
+    'extend_features_as_spline': True,
+    'scale_to_standard': True,
+    'encode_categorical_wrt_target': True,
+}
+PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_EXCLUDE = {
+    'extend_features_as_polynomial': {
+        'scale_to_standard': STANDARD_SCALER_PARAMS_COMMON_EXCLUDE,
+    },
+    'scale_to_standard': STANDARD_SCALER_PARAMS_COMMON_EXCLUDE,
+}
+
+# %%
+preprocess_transformer = build_preprocess_augmenting_transformer()
 preprocess_transformer
 
 # %% [markdown]
 # Демонстрация предобработки данных:
 
 # %%
-preprocess_transformer_tmp = build_preprocess_transformer()
+preprocess_transformer_tmp = build_preprocess_augmenting_transformer()
 df_augd_features_matrix_train = preprocess_transformer_tmp.fit_transform(df_orig_features_train, df_target_train.iloc[:, 0])
 df_augd_features_train = pandas_dataframe_from_transformed_artifacts(df_augd_features_matrix_train, preprocess_transformer_tmp)
 del preprocess_transformer_tmp
@@ -467,7 +493,7 @@ df_augd_features_train.info()
 df_augd_features_train.head(0x8)
 
 # %%
-regressor = build_regressor(random_state=0x3AEF)
+regressor = build_regressor_baseline(random_state=0x3AEF)
 regressor
 
 # %% [markdown]
@@ -484,28 +510,11 @@ pipeline
 model_params = filter_params(
     pipeline.get_params(),
     include={
-        'preprocess': (
-            False,
-            {
-                **{k: True for k in COLUMN_TRANSFORMER_PARAMS_COMMON_INCLUDE},
-                'extend_features_as_polynomial': {
-                    'extend_features': True,
-                    'scale_to_standard': True,
-                },
-                'extend_features_as_spline': True,
-                'scale_to_standard': True,
-                'encode_categorical_wrt_target': True,
-            },
-        ),
+        'preprocess': (False, PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_INCLUDE.copy()),
         'regress': (False, True),
     },
     exclude={
-        'preprocess': {
-            'extend_features_as_polynomial': {
-                'scale_to_standard': STANDARD_SCALER_PARAMS_COMMON_EXCLUDE,
-            },
-            'scale_to_standard': STANDARD_SCALER_PARAMS_COMMON_EXCLUDE,
-        },
+        'preprocess': PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_EXCLUDE.copy(),
         'regress': RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE,
     },
 )
@@ -550,7 +559,7 @@ mlflow_log_model(
 # ### Модель с дополнительными и отфильтрованными признаками
 
 # %%
-regressor = build_regressor(random_state=0x8EDD)
+regressor = build_regressor_baseline(random_state=0x8EDD)
 regressor
 
 # %% [markdown]
@@ -562,11 +571,24 @@ len(df_augd_features_train.columns)
 
 # %%
 FILTERED_FEATURES_NUM = (4, 8)
+
 # %%
-feature_selector = build_sequential_feature_selector(
-    regressor, k_features=FILTERED_FEATURES_NUM, forward=True, floating=True, cv=4, scoring='neg_mean_absolute_percentage_error',
-    verbose=1,
-)
+def build_feature_selector(*, verbose=0):
+    return build_sequential_feature_selector(
+        regressor, k_features=FILTERED_FEATURES_NUM, forward=True, floating=True, cv=4, scoring='neg_mean_absolute_percentage_error',
+        verbose=verbose,
+    )
+
+
+# %%
+FEATURE_SELECTOR_PARAMS_COMMON_INCLUDE = {
+    **{k: True for k in SEQUENTIAL_FEATURE_SELECTOR_PARAMS_COMMON_INCLUDE},
+    'estimator': False,
+}
+FEATURE_SELECTOR_PARAMS_COMMON_EXCLUDE = () # TODO: ай-яй-яй
+
+# %%
+feature_selector = build_feature_selector(verbose=1)
 feature_selector
 
 # %%
@@ -595,7 +617,7 @@ _ = ax.set_ylim((None, 0.))
 # %%
 pipeline = sklearn.pipeline.Pipeline([
-    ('preprocess', preprocess_transformer),
+    ('preprocess', build_preprocess_augmenting_transformer()),
     ('select_features', feature_selector),
     ('regress', regressor),
 ])
 pipeline
@@ -605,36 +627,13 @@
 model_params = filter_params(
     pipeline.get_params(),
     include={
-        'preprocess': (
-            False,
-            {
-                **{k: True for k in COLUMN_TRANSFORMER_PARAMS_COMMON_INCLUDE},
-                'extend_features_as_polynomial': {
-                    'extend_features': True,
-                    'scale_to_standard': True,
-                },
-                'extend_features_as_spline': True,
-                'scale_to_standard': True,
-                'encode_categorical_wrt_target': True,
-            },
-        ),
-        'select_features': (
-            False,
-            {
-                **{k: True for k in SEQUENTIAL_FEATURE_SELECTOR_PARAMS_COMMON_INCLUDE},
-                'estimator': False,
-            },
-        ),
+        'preprocess': (False, PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_INCLUDE.copy()),
+        'select_features': (False, FEATURE_SELECTOR_PARAMS_COMMON_INCLUDE.copy()),
         'regress': (False, True),
     },
     exclude={
-        'preprocess': {
-            'extend_features_as_polynomial': {
-                'scale_to_standard': STANDARD_SCALER_PARAMS_COMMON_EXCLUDE,
-            },
-            'scale_to_standard': STANDARD_SCALER_PARAMS_COMMON_EXCLUDE,
-        },
-        'select_features': (), # TODO: ай-яй-яй
+        'preprocess': PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_EXCLUDE.copy(),
+        'select_features': FEATURE_SELECTOR_PARAMS_COMMON_EXCLUDE,
         'regress': RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE,
     },
 )
@@ -676,4 +675,130 @@ mlflow_log_model(
     ),
 )
 
+
+# %% [markdown]
+# ### Автоматический подбор гиперпараметров модели
+
+# %% [markdown]
+# Составной пайплайн:
+
+# %%
+def build_pipeline(regressor_n_estimators, regressor_max_depth=None, regressor_max_features='sqrt'):
+    return sklearn.pipeline.Pipeline([
+        ('preprocess', build_preprocess_augmenting_transformer()),
+        ('select_features', build_feature_selector()),
+        ('regress', build_regressor(regressor_n_estimators, max_depth=regressor_max_depth, max_features=regressor_max_features)),
+    ])
+
+
+# %% [markdown]
+# Целевая функция для оптимизатора гиперпараметров (подбирает параметры `RandomForestRegressor`: `n_estimators`, `max_depth`, `max_features`):
+
+# %%
+def regressor_hyperparams_objective(trial):
+    n_estimators = trial.suggest_int('n_estimators', 1, 256, log=True)
+    max_depth = trial.suggest_int('max_depth', 1, 16, log=True)
+    max_features = trial.suggest_float('max_features', 0.1, 1.)
+    # составной пайплайн:
+    pipeline = build_pipeline(n_estimators, regressor_max_depth=max_depth, regressor_max_features=max_features)
+    # обучение модели:
+    _ = pipeline.fit(df_orig_features_train, df_target_train.iloc[:, 0])
+    # NOTE(review): the objective scores on the held-out test set, so the tuned hyperparameters overfit it; consider CV on the training set instead
+    # оценка качества:
+    target_test_predicted = pipeline.predict(df_orig_features_test)
+    # метрика качества (MAPE):
+    mape = sklearn.metrics.mean_absolute_percentage_error(df_target_test, target_test_predicted)
+    return mape
+
+
+# %% [markdown]
+# optuna study:
+
+# %%
+optuna_study = optuna.create_study(direction='minimize')
+optuna_study.optimize(regressor_hyperparams_objective, n_trials=64, timeout=120.)
+
+# %% [markdown]
+# Количество выполненных trials:
+
+# %%
+len(optuna_study.trials)
+
+# %% [markdown]
+# Лучшие найденные гиперпараметры (недетерминированы, один из результатов записан явно):
+
+# %%
+optuna_study.best_params
+
+# %%
+regressor_best_params = {
+    #'n_estimators': 51,
+    'n_estimators': 50,
+    'max_depth': 11,
+    #'max_features': 0.44655290756636146,
+    'max_features': 0.45,
+}
+
+# %% [markdown]
+# Составной пайплайн:
+
+# %%
+pipeline = build_pipeline(
+    regressor_best_params['n_estimators'],
+    regressor_max_depth=regressor_best_params['max_depth'],
+    regressor_max_features=regressor_best_params['max_features'],
+)
+pipeline

+# %%
+model_params = filter_params(
+    pipeline.get_params(),
+    include={
+        'preprocess': (False, PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_INCLUDE.copy()),
+        'select_features': (False, FEATURE_SELECTOR_PARAMS_COMMON_INCLUDE.copy()),
+        'regress': (False, True),
+    },
+    exclude={
+        'preprocess': PREPROCESS_AUGMENTING_TRANSFORMER_PARAMS_COMMON_EXCLUDE.copy(),
+        'select_features': FEATURE_SELECTOR_PARAMS_COMMON_EXCLUDE,
+        'regress': RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE,
+    },
+)
+model_params
+
+# %% [markdown]
+# Обучение модели:
+
+# %%
+_ = pipeline.fit(df_orig_features_train, df_target_train.iloc[:, 0])
+
+# %% [markdown]
+# Оценка качества:
+
+# %%
+target_test_predicted = pipeline.predict(df_orig_features_test)
+
+# %% [markdown]
+# Метрики качества (MAPE, а также MSE, MAE):
+
+# %%
+metrics = score_predictions(df_target_test, target_test_predicted)
+metrics
+
+# %%
+mlflow_log_model(
+    pipeline,
+    model_params=model_params,
+    metrics={k: float(v) for k, v in metrics.items()},
+    run_name=mlflow_optimized_feateng_filtered_run_name,
+    model_signature=mlflow_model_signature,
+    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
+    #pip_requirements=str(MODEL_PIP_REQUIREMENTS_PATH),
+    comment_file_path=(
+        model_comment_path
+        if model_comment_path is not None
+        else (BASE_PATH / 'research' / model_comment_relpath)
+    ),
+)
+
 # %%