diff --git a/research/research.py b/research/research.py index c39bb89..8179e28 100644 --- a/research/research.py +++ b/research/research.py @@ -31,10 +31,10 @@ data_aug_pickle_path: Optional[str] = None data_aug_pickle_relpath: str = 'cars.aug.pickle' # Путь к файлу (pickle) для сохранения очищенного датасета относительно директории данных `data`. Игнорируется, если установлен data_aug_pickle_path. -model_comment_path: Optional[str] = None -# Полный путь к текстовому файлу с произвольным комментарием для сохранения в MLFlow как артефакт вместе с моделью. Если не установлен, используется `research/`. -model_comment_relpath: str = 'comment.txt' -# Путь к текстовому файлу с произвольным комментарием для сохранения в MLFlow как артефакт вместе с моделью относительно директории `research`. Игнорируется, если установлен comment_path. +#model_global_comment_path: Optional[str] = None +## Полный путь к текстовому файлу с произвольным комментарием для сохранения в MLFlow как артефакт вместе с моделью. Если не установлен, используется `research/`. +#model_comment_relpath: str = 'comment.txt' +## Путь к текстовому файлу с произвольным комментарием для сохранения в MLFlow как артефакт вместе с моделью относительно директории `research`. Игнорируется, если установлен comment_path. mlflow_tracking_server_uri: str = 'http://localhost:5000' # URL tracking-сервера MLFlow. @@ -51,7 +51,7 @@ mlflow_root_run_name: str = 'Models' # Имя корневого прогона MLFlow (остальные прогоны будут созданы блокнотом внутри этого, как nested) # %% -from collections.abc import Sequence +from collections.abc import Collection, Sequence import os import pathlib import pickle @@ -291,7 +291,8 @@ def mlflow_log_model( model_signature=None, input_example=None, pip_requirements=None, - comment_file_path=None, + #global_comment_file_path=None, + extra_logs_handler=None, ): global mlflow_root_run_id if not mlflow_do_log: @@ -322,8 +323,13 @@ def mlflow_log_model( _ = mlflow.log_params(model_params) if metrics is not None: _ = mlflow.log_metrics(metrics) - if (comment_file_path is not None) and comment_file_path.exists(): - mlflow.log_artifact(str(comment_file_path)) + #if (global_comment_file_path is not None) and global_comment_file_path.exists(): + # mlflow.log_artifact(str(global_comment_file_path)) + if extra_logs_handler is not None: + if callable(extra_logs_handler) and (not isinstance(extra_logs_handler, Collection)): + extra_logs_handler = (extra_logs_handler,) + for extr_logs_handler_fn in extra_logs_handler: + extr_logs_handler_fn(mlflow) # %% [markdown] @@ -410,11 +416,11 @@ mlflow_log_model( model_signature=mlflow_model_signature, input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE), pip_requirements=MODEL_PIP_REQUIREMENTS_PATH, - comment_file_path=( - model_comment_path - if model_comment_path is not None - else (BASE_PATH / 'research' / model_comment_relpath) - ), + #global_comment_file_path=( + # model_comment_path + # if model_comment_path is not None + # else (BASE_PATH / 'research' / model_comment_relpath) + #), ) # %% [markdown] @@ -563,16 +569,48 @@ mlflow_log_model( model_signature=mlflow_model_signature, input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE), pip_requirements=MODEL_PIP_REQUIREMENTS_PATH, - comment_file_path=( - model_comment_path - if model_comment_path is not None - else (BASE_PATH / 'research' / model_comment_relpath) - ), + #global_comment_file_path=( + # model_comment_path + # if model_comment_path is not None + # else (BASE_PATH / 'research' / model_comment_relpath) + #), ) + # %% [markdown] # ### Модель с дополнительными и отфильтрованными признаками +# %% +def build_selected_columns_info_for_mlflow(names=None, indices=None): + info = {} + if names is not None: + info['names'] = names + if indices is not None: + info['indices'] = indices + return info + +def build_extra_logs_handler_selected_columns(names=None, indices=None): + def extra_log(mlf): + if any((v is not None) for v in (names, indices)): + info = build_selected_columns_info_for_mlflow(names=names, indices=indices) + mlf.log_dict(info, 'selected_columns_info.json') + return extra_log + + +# %% +def build_selected_columns_info_for_mlflow_from_sequential_feature_selector(feature_selector, *, take_names=True, take_indices=True): + return build_selected_columns_info_for_mlflow( + names=(feature_selector.k_feature_names_ if take_names else None), + indices=(tuple(feature_selector.k_feature_idx_) if take_indices else None), + ) + +def build_extra_logs_handler_selected_columns_from_sequential_feature_selector(feature_selector): + def extra_log(mlf): + info = build_selected_columns_info_for_mlflow_from_sequential_feature_selector(feature_selector) + mlf.log_dict(info, 'selected_columns_info.json') + return extra_log + + # %% regressor = build_regressor_baseline(random_state=0x8EDD) regressor @@ -610,10 +648,10 @@ feature_selector _ = feature_selector.fit(df_augd_features_train, df_target_train.iloc[:, 0]) # %% [markdown] -# Имена выбранных признаков: +# Выбранные признаки (имена и индексы): # %% -feature_selector.k_feature_names_ +build_selected_columns_info_for_mlflow_from_sequential_feature_selector(feature_selector) # %% [markdown] # MAPE в зависимости от количества выбранных признаков (указан регион выбора, ограниченный `FILTERED_FEATURES_NUM`): @@ -683,11 +721,12 @@ mlflow_log_model( model_signature=mlflow_model_signature, input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE), pip_requirements=MODEL_PIP_REQUIREMENTS_PATH, - comment_file_path=( - model_comment_path - if model_comment_path is not None - else (BASE_PATH / 'research' / model_comment_relpath) - ), + #global_comment_file_path=( + # model_comment_path + # if model_comment_path is not None + # else (BASE_PATH / 'research' / model_comment_relpath) + #), + extra_logs_handler=(build_extra_logs_handler_selected_columns_from_sequential_feature_selector(pipeline.named_steps['select_features']),), ) @@ -809,11 +848,12 @@ mlflow_log_model( model_signature=mlflow_model_signature, input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE), pip_requirements=MODEL_PIP_REQUIREMENTS_PATH, - comment_file_path=( - model_comment_path - if model_comment_path is not None - else (BASE_PATH / 'research' / model_comment_relpath) - ), + #global_comment_file_path=( + # model_comment_path + # if model_comment_path is not None + # else (BASE_PATH / 'research' / model_comment_relpath) + #), + extra_logs_handler=(build_extra_logs_handler_selected_columns_from_sequential_feature_selector(pipeline.named_steps['select_features']),), ) # %% [markdown] @@ -854,11 +894,12 @@ mlflow_log_model( model_signature=mlflow_model_signature, input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE), pip_requirements=MODEL_PIP_REQUIREMENTS_PATH, - comment_file_path=( - model_comment_path - if model_comment_path is not None - else (BASE_PATH / 'research' / model_comment_relpath) - ), + #global_comment_file_path=( + # model_comment_path + # if model_comment_path is not None + # else (BASE_PATH / 'research' / model_comment_relpath) + #), + extra_logs_handler=(build_extra_logs_handler_selected_columns_from_sequential_feature_selector(pipeline.named_steps['select_features']),), ) # %%