в блокнот research добавить логирование списков выбранных признаков в MLFlow (через новый коллбек в mlflow_log_model), закомментировать логирование global_comment_file

2025-11-01 21:23:42 +03:00
--- a/research/research.py
+++ b/research/research.py
@@ -31,10 +31,10 @@ data_aug_pickle_path: Optional[str] = None
 data_aug_pickle_relpath: str = 'cars.aug.pickle'
 # Путь к файлу (pickle) для сохранения очищенного датасета относительно директории данных `data`. Игнорируется, если установлен data_aug_pickle_path.

-model_comment_path: Optional[str] = None
-# Полный путь к текстовому файлу с произвольным комментарием для сохранения в MLFlow как артефакт вместе с моделью. Если не установлен, используется `research/<comment_relpath>`.
-model_comment_relpath: str = 'comment.txt'
-# Путь к текстовому файлу с произвольным комментарием для сохранения в MLFlow как артефакт вместе с моделью относительно директории `research`. Игнорируется, если установлен comment_path.
+#model_comment_path: Optional[str] = None
+## Полный путь к текстовому файлу с произвольным комментарием для сохранения в MLFlow как артефакт вместе с моделью. Если не установлен, используется `research/<model_comment_relpath>`.
+#model_comment_relpath: str = 'comment.txt'
+## Путь к текстовому файлу с произвольным комментарием для сохранения в MLFlow как артефакт вместе с моделью относительно директории `research`. Игнорируется, если установлен model_comment_path.

 mlflow_tracking_server_uri: str = 'http://localhost:5000'
 # URL tracking-сервера MLFlow.
@@ -51,7 +51,7 @@ mlflow_root_run_name: str = 'Models'
 # Имя корневого прогона MLFlow (остальные прогоны будут созданы блокнотом внутри этого, как nested)

 # %%
-from collections.abc import Sequence
+from collections.abc import Collection, Sequence
 import os
 import pathlib
 import pickle
@@ -291,7 +291,8 @@ def mlflow_log_model(
    model_signature=None,
    input_example=None,
    pip_requirements=None,
-    comment_file_path=None,
+    #global_comment_file_path=None,
+    extra_logs_handler=None,
 ):
    global mlflow_root_run_id
    if not mlflow_do_log:
@@ -322,8 +323,13 @@ def mlflow_log_model(
                _ = mlflow.log_params(model_params)
            if metrics is not None:
                _ = mlflow.log_metrics(metrics)
-            if (comment_file_path is not None) and comment_file_path.exists():
-                mlflow.log_artifact(str(comment_file_path))
+            #if (global_comment_file_path is not None) and global_comment_file_path.exists():
+            #    mlflow.log_artifact(str(global_comment_file_path))
+            if extra_logs_handler is not None:
+                if callable(extra_logs_handler) and (not isinstance(extra_logs_handler, Collection)):
+                    extra_logs_handler = (extra_logs_handler,)
+                for extr_logs_handler_fn in extra_logs_handler:
+                    extr_logs_handler_fn(mlflow)


 # %% [markdown]
@@ -410,11 +416,11 @@ mlflow_log_model(
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
-    comment_file_path=(
-        model_comment_path
-        if model_comment_path is not None
-        else (BASE_PATH / 'research' / model_comment_relpath)
-    ),
+    #global_comment_file_path=(
+    #    model_comment_path
+    #    if model_comment_path is not None
+    #    else (BASE_PATH / 'research' / model_comment_relpath)
+    #),
 )

 # %% [markdown]
@@ -563,16 +569,48 @@ mlflow_log_model(
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
-    comment_file_path=(
-        model_comment_path
-        if model_comment_path is not None
-        else (BASE_PATH / 'research' / model_comment_relpath)
-    ),
+    #global_comment_file_path=(
+    #    model_comment_path
+    #    if model_comment_path is not None
+    #    else (BASE_PATH / 'research' / model_comment_relpath)
+    #),
 )

+
 # %% [markdown]
 # ### Модель с дополнительными и отфильтрованными признаками

+# %%
+def build_selected_columns_info_for_mlflow(names=None, indices=None):
+    """Return a dict holding `names` and/or `indices`; arguments left as None are omitted."""
+    candidates = (('names', names), ('indices', indices))
+    # Insertion order is 'names' first, then 'indices'.
+    return {
+        key: value for key, value in candidates if value is not None
+    }
+
+def build_extra_logs_handler_selected_columns(names=None, indices=None):
+    """Return a callback that logs the given columns via `mlf.log_dict`; it does nothing when both are None."""
+    def extra_log(mlf):
+        if (names is not None) or (indices is not None):
+            mlf.log_dict(build_selected_columns_info_for_mlflow(names=names, indices=indices), 'selected_columns_info.json')
+    return extra_log
+
+
+# %%
+def build_selected_columns_info_for_mlflow_from_sequential_feature_selector(feature_selector, *, take_names=True, take_indices=True):
+    """Build the selected-columns dict from the selector's `k_feature_names_` / `k_feature_idx_`, each read only if its flag is true."""
+    names = feature_selector.k_feature_names_ if take_names else None
+    indices = tuple(feature_selector.k_feature_idx_) if take_indices else None
+    return build_selected_columns_info_for_mlflow(names=names, indices=indices)
+
+def build_extra_logs_handler_selected_columns_from_sequential_feature_selector(feature_selector):
+    """Return a callback that logs the selector's columns via `mlf.log_dict`; the selector is read when the callback runs."""
+    def extra_log(mlf):
+        mlf.log_dict(build_selected_columns_info_for_mlflow_from_sequential_feature_selector(feature_selector), 'selected_columns_info.json')
+    return extra_log
+
+
 # %%
 regressor = build_regressor_baseline(random_state=0x8EDD)
 regressor
@@ -610,10 +648,10 @@ feature_selector
 _ = feature_selector.fit(df_augd_features_train, df_target_train.iloc[:, 0])

 # %% [markdown]
-# Имена выбранных признаков:
+# Выбранные признаки (имена и индексы):

 # %%
-feature_selector.k_feature_names_
+build_selected_columns_info_for_mlflow_from_sequential_feature_selector(feature_selector)

 # %% [markdown]
 # MAPE в зависимости от количества выбранных признаков (указан регион выбора, ограниченный `FILTERED_FEATURES_NUM`):
@@ -683,11 +721,12 @@ mlflow_log_model(
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
-    comment_file_path=(
-        model_comment_path
-        if model_comment_path is not None
-        else (BASE_PATH / 'research' / model_comment_relpath)
-    ),
+    #global_comment_file_path=(
+    #    model_comment_path
+    #    if model_comment_path is not None
+    #    else (BASE_PATH / 'research' / model_comment_relpath)
+    #),
+    extra_logs_handler=(build_extra_logs_handler_selected_columns_from_sequential_feature_selector(pipeline.named_steps['select_features']),),
 )


@@ -809,11 +848,12 @@ mlflow_log_model(
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
-    comment_file_path=(
-        model_comment_path
-        if model_comment_path is not None
-        else (BASE_PATH / 'research' / model_comment_relpath)
-    ),
+    #global_comment_file_path=(
+    #    model_comment_path
+    #    if model_comment_path is not None
+    #    else (BASE_PATH / 'research' / model_comment_relpath)
+    #),
+    extra_logs_handler=(build_extra_logs_handler_selected_columns_from_sequential_feature_selector(pipeline.named_steps['select_features']),),
 )

 # %% [markdown]
@@ -854,11 +894,12 @@ mlflow_log_model(
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
-    comment_file_path=(
-        model_comment_path
-        if model_comment_path is not None
-        else (BASE_PATH / 'research' / model_comment_relpath)
-    ),
+    #global_comment_file_path=(
+    #    model_comment_path
+    #    if model_comment_path is not None
+    #    else (BASE_PATH / 'research' / model_comment_relpath)
+    #),
+    extra_logs_handler=(build_extra_logs_handler_selected_columns_from_sequential_feature_selector(pipeline.named_steps['select_features']),),
 )

 # %%