diff --git a/iis_project/sklearn_utils/pandas.py b/iis_project/sklearn_utils/pandas.py new file mode 100644 index 0000000..01c1244 --- /dev/null +++ b/iis_project/sklearn_utils/pandas.py @@ -0,0 +1,5 @@ +from pandas import DataFrame + + +def pandas_dataframe_from_transformed_artifacts(matrix, transformer) -> DataFrame: + return DataFrame(matrix, columns=transformer.get_feature_names_out()) diff --git a/iis_project/sklearn_utils/pipeline.py b/iis_project/sklearn_utils/pipeline.py deleted file mode 100644 index 3b3adae..0000000 --- a/iis_project/sklearn_utils/pipeline.py +++ /dev/null @@ -1 +0,0 @@ -PIPELINE_PARAMS_COMMON_INCLUDE = ['transform_input'] diff --git a/research/research.py b/research/research.py index b87bf99..e3a68e0 100644 --- a/research/research.py +++ b/research/research.py @@ -39,13 +39,15 @@ mlflow_registry_uri: Optional[str] = None # URL сервера registry MLFlow (если не указан, используется `mlflow_tracking_server_uri`). mlflow_do_log: bool = False -# Записывать ли прогон (run) в MLFlow; если True, при каждом исполнении блокнота создаётся новый прогон с именем `mlflow_run_name`. +# Записывать ли прогоны (runs) в MLFlow. mlflow_experiment_id: Optional[str] = None # ID эксперимента MLFlow, имеет приоритет над `mlflow_experiment_name`. mlflow_experiment_name: Optional[str] = 'Current price predicion for used cars' # Имя эксперимента MLFlow (ниже приоритетом, чем `mlflow_experiment_id`). -mlflow_run_name: str = 'Baseline model' -# Имя нового прогона MLFlow (используется для создания нового прогона, если `mlflow_do_log` установлен в True). +mlflow_baseline_run_name: str = 'Baseline model' +# Имя нового прогона MLFlow для baseline модели. 
+mlflow_feateng_run_name: str = 'Model with engineered features' +# Имя ноговго прогона MLFlow для модели, использующей дополнительные признаки # %% import os @@ -75,7 +77,7 @@ sys.path.insert(0, str(CODE_PATH.resolve())) from iis_project.sklearn_utils import filter_params from iis_project.sklearn_utils.compose import COLUMN_TRANSFORMER_PARAMS_COMMON_INCLUDE from iis_project.sklearn_utils.ensemble import RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE -from iis_project.sklearn_utils.pipeline import PIPELINE_PARAMS_COMMON_INCLUDE +from iis_project.sklearn_utils.pandas import pandas_dataframe_from_transformed_artifacts from iis_project.sklearn_utils.preprocessing import STANDARD_SCALER_PARAMS_COMMON_EXCLUDE # %% @@ -112,23 +114,17 @@ with open( df_orig = pickle.load(input_file) # %% [markdown] -# Обзор строк датасета: - -# %% -df_orig.head(0x10) - -# %% [markdown] -# Размер датасета: +# Обзор датасета: # %% len(df_orig) -# %% [markdown] -# Количество непустых значений и тип каждого столбца: - # %% df_orig.info() +# %% +df_orig.head(0x10) + # %% [markdown] # ## Разделение датасета на выборки @@ -196,7 +192,7 @@ df_orig_features_train, df_orig_features_test, df_target_train, df_target_test = tuple(map(len, (df_target_train, df_target_test))) # %% [markdown] -# ## Создание пайплайнов обработки признаков и обучения модели +# ## Модели # %% #MODEL_PIP_REQUIREMENTS_PATH = BASE_PATH / 'requirements' / 'requirements-isolated-research-model.txt' @@ -208,6 +204,7 @@ tuple(map(len, (df_target_train, df_target_test))) mlflow_model_signature = mlflow.models.infer_signature(model_input=df_orig_features, model_output=df_target) mlflow_model_signature + # %% [raw] vscode={"languageId": "raw"} # input_schema = mlflow.types.schema.Schema([ # mlflow.types.schema.ColSpec("double", "selling_price"), @@ -224,33 +221,230 @@ mlflow_model_signature # # mlflow_model_signature = mlflow.models.ModelSignature(inputs=input_schema, outputs=output_schema) +# %% +def build_features_scaler_standard(): 
+ return sklearn.preprocessing.StandardScaler() + + +# %% +#def build_categorical_features_encoder_onehot(): +# return sklearn.preprocessing.OneHotEncoder() + +def build_categorical_features_encoder_target(*, random_state=None): + return sklearn.preprocessing.TargetEncoder( + target_type='continuous', smooth='auto', shuffle=True, random_state=random_state, + ) + + +# %% [markdown] +# Регрессор — небольшой случайный лес, цель — минимизация квадрата ошибки предсказания: + +# %% +def build_regressor(*, random_state=None): + return sklearn.ensemble.RandomForestRegressor( + 10, criterion='squared_error', max_features='sqrt', random_state=random_state, + ) + + +# %% +def score_predictions(target_test, target_test_predicted): + return { + 'mse': sklearn.metrics.mean_squared_error(target_test, target_test_predicted), + 'mae': sklearn.metrics.mean_absolute_error(target_test, target_test_predicted), + 'mape': sklearn.metrics.mean_absolute_percentage_error(target_test, target_test_predicted), + } + + +# %% +# использует глобальные переменные mlflow_do_log, mlflow_experiment +def mlflow_log_model( + model, + model_params, + metrics, + *, + run_name, + model_signature=None, + input_example=None, + #pip_requirements=None, + comment_file_path=None, +): + if not mlflow_do_log: + return + with mlflow.start_run(experiment_id=mlflow_experiment.experiment_id, run_name=run_name): + _ = mlflow.sklearn.log_model( + model, + 'model', + signature=model_signature, + input_example=input_example, + #pip_requirements=pip_requirements, + ) + if model_params is not None: + _ = mlflow.log_params(model_params) + if metrics is not None: + _ = mlflow.log_metrics(metrics) + if (comment_file_path is not None) and comment_file_path.exists(): + mlflow.log_artifact(str(comment_file_path)) + + +# %% [markdown] +# ### Baseline модель + # %% [markdown] # Пайплайн предобработки признаков: # %% preprocess_transformer = sklearn.compose.ColumnTransformer( [ - ('scale_to_standard', 
sklearn.preprocessing.StandardScaler(), features_to_scale_to_standard_columns), + ('scale_to_standard', build_features_scaler_standard(), features_to_scale_to_standard_columns), ( #'encode_categoricals_one_hot', 'encode_categoricals_wrt_target', - #sklearn.preprocessing.OneHotEncoder(), - sklearn.preprocessing.TargetEncoder( - target_type='continuous', smooth='auto', shuffle=True, random_state=0x2ED6, - ), + #build_categorical_features_encoder_onehot(), + build_categorical_features_encoder_target(random_state=0x2ED6), features_to_encode_wrt_target_columns, ), ], remainder='drop', ) +# %% +regressor = build_regressor(random_state=0x016B) +regressor + # %% [markdown] -# Регрессор — небольшой случайный лес, цель — минимизация квадрата ошибки предсказания: +# Составной пайплайн: + +# %% +pipeline = sklearn.pipeline.Pipeline([ + ('preprocess', preprocess_transformer), + ('regress', regressor), +]) +pipeline + +# %% +model_params = filter_params( + pipeline.get_params(), + include={ + 'preprocess': ( + False, + { + **{k: True for k in COLUMN_TRANSFORMER_PARAMS_COMMON_INCLUDE}, + 'scale_to_standard': True, + 'encode_categorical_wrt_target': True, + }, + ), + 'regress': (False, True), + }, + exclude={ + 'preprocess': {'scale_to_standard': STANDARD_SCALER_PARAMS_COMMON_EXCLUDE}, + 'regress': RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE, + }, +) +model_params + +# %% [markdown] +# Обучение модели: + +# %% +_ = pipeline.fit(df_orig_features_train, df_target_train.iloc[:, 0]) + +# %% [markdown] +# Оценка качества: + +# %% +target_test_predicted = pipeline.predict(df_orig_features_test) + +# %% [markdown] +# Метрики качества (MAPE, а также MSE, MAE): # %% -regressor = sklearn.ensemble.RandomForestRegressor( - 10, criterion='squared_error', max_features='sqrt', random_state=0x016B, +metrics = score_predictions(df_target_test, target_test_predicted) +metrics + +# %% +mlflow_log_model( + pipeline, + model_params=model_params, + metrics={k: float(v) for k, v in metrics.items()}, + 
run_name=mlflow_baseline_run_name, + model_signature=mlflow_model_signature, + input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE), + #pip_requirements=str(MODEL_PIP_REQUIREMENTS_PATH), + comment_file_path=( + model_comment_path + if model_comment_path is not None + else (BASE_PATH / 'research' / model_comment_relpath) + ), +) + +# %% [markdown] +# ### Модель с дополнительными признаками + +# %% [markdown] +# Пайплайн предобработки признаков: + +# %% +features_to_extend_as_polynomial = ('selling_price', 'driven_kms') +features_to_extend_as_spline = ('age',) + +# %% +assert set(features_to_extend_as_polynomial) <= {*features_to_scale_to_standard_columns} +assert set(features_to_extend_as_spline) <= {*features_to_scale_to_standard_columns} + +preprocess_transformer = sklearn.compose.ColumnTransformer( + [ + ( + 'extend_features_as_polynomial', + sklearn.pipeline.Pipeline([ + ( + 'extend_features', + sklearn.preprocessing.PolynomialFeatures(2, include_bias=False), + ), + ('scale_to_standard', build_features_scaler_standard()), + ]), + features_to_extend_as_polynomial, + ), + ( + 'extend_features_as_spline', + sklearn.preprocessing.SplineTransformer( + 4, knots='quantile', extrapolation='constant', include_bias=False, + ), + features_to_extend_as_spline, + ), + ( + 'scale_to_standard', + build_features_scaler_standard(), + tuple(filter(lambda f: f not in features_to_extend_as_polynomial, features_to_scale_to_standard_columns)), + ), + ( + 'encode_categoricals_wrt_target', + build_categorical_features_encoder_target(random_state=0x2ED6), + features_to_encode_wrt_target_columns, + ), + ], + remainder='drop', ) +preprocess_transformer + +# %% [markdown] +# Демонстрация предобработки данных: + +# %% +df_tfd_features_matrix_test = preprocess_transformer.fit_transform(df_orig_features_test, df_target_test.iloc[:, 0]) +df_tfd_features_test = pandas_dataframe_from_transformed_artifacts(df_tfd_features_matrix_test, preprocess_transformer) + +# %% [markdown] +# Обзор 
предобработанного датасета: + +# %% +df_tfd_features_test.info() + +# %% +df_tfd_features_test.head(0x8) + +# %% +regressor = build_regressor(random_state=0x3AEF) +regressor # %% [markdown] # Составной пайплайн: @@ -266,11 +460,15 @@ pipeline model_params = filter_params( pipeline.get_params(), include={ - **{k: True for k in PIPELINE_PARAMS_COMMON_INCLUDE}, 'preprocess': ( False, { **{k: True for k in COLUMN_TRANSFORMER_PARAMS_COMMON_INCLUDE}, + 'extend_features_as_polynomial': { + 'extend_features': True, + 'scale_to_standard': True, + }, + 'extend_features_as_spline': True, 'scale_to_standard': True, 'encode_categorical_wrt_target': True, }, @@ -278,18 +476,26 @@ model_params = filter_params( 'regress': (False, True), }, exclude={ - 'preprocess': {'scale_to_standard': STANDARD_SCALER_PARAMS_COMMON_EXCLUDE}, + 'preprocess': { + 'extend_features_as_polynomial': { + 'scale_to_standard': STANDARD_SCALER_PARAMS_COMMON_EXCLUDE, + }, + 'scale_to_standard': STANDARD_SCALER_PARAMS_COMMON_EXCLUDE, + }, 'regress': RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE, }, ) model_params # %% [markdown] -# ## Baseline модель +# Обучение модели: # %% _ = pipeline.fit(df_orig_features_train, df_target_train.iloc[:, 0]) +# %% [markdown] +# Оценка качества: + # %% target_test_predicted = pipeline.predict(df_orig_features_test) @@ -297,31 +503,23 @@ target_test_predicted = pipeline.predict(df_orig_features_test) # Метрики качества (MAPE, а также MSE, MAE): # %% -metrics = { - 'mse': sklearn.metrics.mean_squared_error(df_target_test, target_test_predicted), - 'mae': sklearn.metrics.mean_absolute_error(df_target_test, target_test_predicted), - 'mape': sklearn.metrics.mean_absolute_percentage_error(df_target_test, target_test_predicted), -} +metrics = score_predictions(df_target_test, target_test_predicted) +metrics # %% -metrics +mlflow_log_model( + pipeline, + model_params=model_params, + metrics={k: float(v) for k, v in metrics.items()}, + run_name=mlflow_feateng_run_name, + 
model_signature=mlflow_model_signature, + input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE), + #pip_requirements=str(MODEL_PIP_REQUIREMENTS_PATH), + comment_file_path=( + model_comment_path + if model_comment_path is not None + else (BASE_PATH / 'research' / model_comment_relpath) + ), +) # %% -if mlflow_do_log: - with mlflow.start_run(experiment_id=mlflow_experiment.experiment_id, run_name=mlflow_run_name): - _ = mlflow.sklearn.log_model( - pipeline, - 'model', - signature=mlflow_model_signature, - input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE), - #pip_requirements=str(MODEL_PIP_REQUIREMENTS_PATH), - ) - _ = mlflow.log_params(model_params) - _ = mlflow.log_metrics({k: float(v) for k, v in metrics.items()}) - comment_file_path = ( - model_comment_path - if model_comment_path is not None - else (BASE_PATH / 'research' / model_comment_relpath) - ) - if comment_file_path.exists(): - mlflow.log_artifact(str(comment_file_path))