В блокнот research добавлено логирование Python requirements в MLflow

2025-11-01 20:09:18 +03:00
--- a/research/research.py
+++ b/research/research.py
@@ -19,6 +19,9 @@
 # %% [markdown]
 # Блокнот использует файл аугментированных данных датасета о подержанных автомобилях, создаваемый блокнотом `eda/cars_eda.py`. См. ниже параметры блокнота для papermill.

+# %%
+# XXX: разделить блокнот примерно на 5 частей
+
 # %%
 from typing import Optional

@@ -211,7 +214,8 @@ tuple(map(len, (df_target_train, df_target_test)))
 # ## Модели

 # %%
-#MODEL_PIP_REQUIREMENTS_PATH = BASE_PATH / 'requirements' / 'requirements-isolated-research-model.txt'
+# XXX: один файл requirements для всех моделей
+MODEL_PIP_REQUIREMENTS_PATH = BASE_PATH / 'requirements' / 'requirements-isolated-research-model.txt'

 # %% [markdown]
 # Сигнатура модели для MLFlow:
@@ -286,7 +290,7 @@ def mlflow_log_model(
    nested_run_name,
    model_signature=None,
    input_example=None,
-    #pip_requirements=None,
+    pip_requirements=None,
    comment_file_path=None,
 ):
    global mlflow_root_run_id
@@ -305,12 +309,14 @@ def mlflow_log_model(
            mlflow_root_run_id = root_run.info.run_id
        # важно одновременно использовать nested=True и parent_run_id=...:
        with mlflow.start_run(experiment_id=experiment_id, run_name=nested_run_name, nested=True, parent_run_id=mlflow_root_run_id):
+            if isinstance(pip_requirements, pathlib.PurePath):
+                pip_requirements = str(pip_requirements)
            _ = mlflow.sklearn.log_model(
                model,
                'model',
                signature=model_signature,
                input_example=input_example,
-                #pip_requirements=pip_requirements,
+                pip_requirements=pip_requirements,
            )
            if model_params is not None:
                _ = mlflow.log_params(model_params)
@@ -403,7 +409,7 @@ mlflow_log_model(
    nested_run_name='Baseline model',
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
-    #pip_requirements=str(MODEL_PIP_REQUIREMENTS_PATH),
+    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
    comment_file_path=(
        model_comment_path
        if model_comment_path is not None
@@ -556,7 +562,7 @@ mlflow_log_model(
    nested_run_name='Model with engineered features',
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
-    #pip_requirements=str(MODEL_PIP_REQUIREMENTS_PATH),
+    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
    comment_file_path=(
        model_comment_path
        if model_comment_path is not None
@@ -676,7 +682,7 @@ mlflow_log_model(
    nested_run_name='Model with filtered engineered features',
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
-    #pip_requirements=str(MODEL_PIP_REQUIREMENTS_PATH),
+    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
    comment_file_path=(
        model_comment_path
        if model_comment_path is not None
@@ -802,7 +808,7 @@ mlflow_log_model(
    nested_run_name='Optimized model with filtered engineered features',
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
-    #pip_requirements=str(MODEL_PIP_REQUIREMENTS_PATH),
+    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
    comment_file_path=(
        model_comment_path
        if model_comment_path is not None
@@ -847,7 +853,7 @@ mlflow_log_model(
    nested_run_name='Final model',
    model_signature=mlflow_model_signature,
    input_example=df_orig_features.head(MODEL_INOUT_EXAMPLE_SIZE),
-    #pip_requirements=str(MODEL_PIP_REQUIREMENTS_PATH),
+    pip_requirements=MODEL_PIP_REQUIREMENTS_PATH,
    comment_file_path=(
        model_comment_path
        if model_comment_path is not None