{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import mlflow\n", "\n", "from sklearn.model_selection import train_test_split\n", "import pandas as pd\n", "import numpy\n", "\n", "from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder\n", "\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.linear_model import LinearRegression\n", "from catboost import CatBoostRegressor\n", "\n", "from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_pickle('data/clean_data.pkl').sample(frac=0.1, random_state = 2) # Уменьшаем размер чтобы модель быстрее обучалась на лекции\n", "df.info()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "df = df.rename(columns={'price': 'target'})\n", "df = df.drop(columns=['date', 'time'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.25, random_state=2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cat_features = X_train.select_dtypes(include=['category','object']).columns.to_list()\n", "cat_features" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "num_features = X_train.select_dtypes(include=['number']).columns.to_list()\n", "num_features" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "https://scikit-learn.org/stable/api/sklearn.preprocessing.html - разные способы кодирования и 
скалирования"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "s_scaler = StandardScaler()\n",
    "l_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=99999999)  # unknown_value has to be chosen with care\n",
    "regressor = CatBoostRegressor()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Column transformer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ColumnTransformer makes it convenient to process groups of columns separately\n",
    "preprocessor = ColumnTransformer(\n",
    "    transformers=[\n",
    "        ('num', s_scaler, num_features),  # transformations for numeric features\n",
    "        ('cat', l_encoder, cat_features),  # transformations for categorical features\n",
    "    ],\n",
    "    remainder='drop',  # drop the columns that no transformer touches\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline = Pipeline(steps=[\n",
    "    ('preprocessor', preprocessor),\n",
    "    ('model', regressor),\n",
    "])\n",
    "\n",
    "pipeline.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions = pipeline.predict(X_test)\n",
    "\n",
    "metrics = {\n",
    "    \"mae\": mean_absolute_error(y_test, predictions),\n",
    "    \"mape\": mean_absolute_percentage_error(y_test, predictions),\n",
    "    \"mse\": mean_squared_error(y_test, predictions),\n",
    "}\n",
    "\n",
    "metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Work with a local MLflow server\n",
    "TRACKING_SERVER_HOST = \"127.0.0.1\"\n",
    "TRACKING_SERVER_PORT = 5000\n",
    "\n",
    "registry_uri = f\"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}\"\n",
    "tracking_uri = f\"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}\"\n",
    "\n",
    "mlflow.set_tracking_uri(tracking_uri)\n",
    "mlflow.set_registry_uri(registry_uri)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Names of the experiment, of a run inside it, and of the model in the registry\n",
    "EXPERIMENT_NAME = \"estate_project\"\n",
    "RUN_NAME = \"baseline model\"\n",
    "REGISTRY_MODEL_NAME = \"estate_model_rf\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Логируем вручную"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Always log the model signature and an input example; both are prepared here\n",
    "from mlflow.models import infer_signature\n",
    "\n",
    "input_example = X_train.head(5)\n",
    "# Pass the predictions as well, so the signature records the output schema and not only the inputs\n",
    "signature = infer_signature(model_input=input_example, model_output=pipeline.predict(input_example))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The requirements file and an artifact (a text file) are logged too\n",
    "req_file = 'requirements.txt'\n",
    "art = 'comment.txt'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parameters to log: either list them by hand or take all of them from the model\n",
    "#params_dict = {'n_estimators': 10, 'max_depth': 10}\n",
    "params_dict = pipeline.get_params()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the experiment on the first execution and reuse it afterwards.\n",
    "# create_experiment() raises when the name already exists, which would break Restart & Run All.\n",
    "experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)\n",
    "if experiment is None:\n",
    "    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)\n",
    "else:\n",
    "    experiment_id = experiment.experiment_id\n",
    "\n",
    "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
    "    # unique identifier of this run\n",
    "    run_id = run.info.run_id\n",
    "    mlflow.sklearn.log_model(\n",
    "        pipeline,\n",
    "        artifact_path=\"models\",\n",
    "        signature=signature,\n",
    "        input_example=input_example,\n",
    "        pip_requirements=req_file,\n",
    "    )\n",
    "    mlflow.log_metrics(metrics)\n",
    "    mlflow.log_artifact(art)\n",
    "    mlflow.log_params(params_dict)\n",
    "\n",
    "# The run is closed when the with-block exits; re-read it to check the final status\n",
    "run = mlflow.get_run(run_id)\n",
    "assert run.info.status == 'FINISHED'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Удаление runs, experiments\n",
    "\n",
    "Использовать осторожно"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
    "#mlflow.delete_experiment(experiment_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mlflow.search_runs(\n",
    "    #experiment_ids=[experiment_id],\n",
    "    experiment_names=[EXPERIMENT_NAME],\n",
    "    #filter_string='status = \"FAILED\"'\n",
    "    #filter_string='metrics.mae > 1'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#mlflow.delete_run('74d2a7a40c07413c9cf65df841164356')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Автологирование\n",
    "После включения будет срабатывать на каждом обучении модели (на методе fit()).\n",
    "\n",
    "Есть плюсы, есть и минусы. 
Предлагается сделать прогон и сравнить с результатами вручную "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mlflow.sklearn.autolog()\n",
    "\n",
    "with mlflow.start_run(run_name='auto', experiment_id=experiment_id) as run:\n",
    "    pipeline.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Switch autologging off again\n",
    "mlflow.sklearn.autolog(disable=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Model #2\n",
    "Обучим вторую \"маленькую\" модель\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fixed random_state: without it every execution trains a different forest and logs different metrics\n",
    "regressor2 = RandomForestRegressor(n_estimators=10, max_depth=6, random_state=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline = Pipeline(steps=[\n",
    "    ('preprocessor', preprocessor),\n",
    "    ('model', regressor2),\n",
    "])\n",
    "\n",
    "pipeline.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions = pipeline.predict(X_test)\n",
    "\n",
    "metrics = {\n",
    "    \"mae\": mean_absolute_error(y_test, predictions),\n",
    "    \"mape\": mean_absolute_percentage_error(y_test, predictions),\n",
    "    \"mse\": mean_squared_error(y_test, predictions),\n",
    "}\n",
    "\n",
    "metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !!! Check the run name and every logged parameter and artifact: they must belong to the second, \"small\" model.\n",
    "\n",
    "RUN_NAME = 'smaller_model'\n",
    "\n",
    "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
    "\n",
    "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
    "    # unique identifier of this run\n",
    "    run_id = run.info.run_id\n",
    "    mlflow.sklearn.log_model(\n",
    "        pipeline,\n",
    "        artifact_path=\"models\",\n",
    "        signature=signature,\n",
    "        input_example=input_example,\n",
    "        pip_requirements=req_file,\n",
    "    )\n",
    "    mlflow.log_metrics(metrics)\n",
    "    mlflow.log_artifact(art)\n",
    "    mlflow.log_params(pipeline.get_params())\n",
    "\n",
    "# Keep this id under its own name: run_id is overwritten by the following runs,\n",
    "# while the registration cell needs a run that actually contains a model\n",
    "smaller_model_run_id = run_id\n",
    "\n",
    "run = mlflow.get_run(run_id)\n",
    "assert run.info.status == 'FINISHED'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# No model\n",
    "# A run may contain artifacts only, without a model - for example, plots produced during EDA\n",
    "\n",
    "RUN_NAME = 'no_model'\n",
    "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
    "\n",
    "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
    "    run_id = run.info.run_id\n",
    "    mlflow.log_artifact(art)\n",
    "\n",
    "run = mlflow.get_run(run_id)\n",
    "assert run.info.status == 'FINISHED'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Register the model logged by the 'smaller_model' run. The id comes from the run created in this\n",
    "# notebook instead of a hard-coded value, so the cell works against any tracking server\n",
    "mlflow.register_model(f\"runs:/{smaller_model_run_id}/models\", REGISTRY_MODEL_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A model can also be registered right when the run is created\n",
    "\n",
    "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
    "\n",
    "with mlflow.start_run(run_name='register_at_run', experiment_id=experiment_id) as run:\n",
    "    # unique identifier of this run\n",
    "    run_id = run.info.run_id\n",
    "    mlflow.sklearn.log_model(\n",
    "        pipeline,\n",
    "        artifact_path=\"models\",\n",
    "        signature=signature,\n",
    "        input_example=input_example,\n",
    "        pip_requirements=req_file,\n",
    "        registered_model_name=REGISTRY_MODEL_NAME,  # registry entry the new version is added to\n",
    "    )\n",
    "    mlflow.log_metrics(metrics)\n",
    "    mlflow.log_artifact(art)\n",
    "    mlflow.log_params(pipeline.get_params())\n",
    "\n",
    "run = mlflow.get_run(run_id)\n",
    "assert run.info.status == 'FINISHED'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Registered models can be looked up\n",
    "model_reg = mlflow.search_registered_models()\n",
    "model_reg[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_name = REGISTRY_MODEL_NAME\n",
    "model_version = 1\n",
    "\n",
    "model_loaded = mlflow.sklearn.load_model(model_uri=f\"models:/{model_name}/{model_version}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_loaded.predict(X_test.iloc[0:1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_test.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature engineering"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import QuantileTransformer, SplineTransformer, PolynomialFeatures, MinMaxScaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_sklearn = X_train.copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### PolynomialFeatures\n",
    "Создает полином степени `degree` из указанных признаков\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
"source": [ "pf = PolynomialFeatures(degree=2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train_sklearn" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pf.fit_transform(X_train_sklearn[['area','kitchen_area']])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### SplineTransformer\n", "Cоздаёт новую матрицу признаков, состоящую из сплайнов порядка degree. Количество сгенерированных сплайнов равно `n_splines=n_knots + degree - 1` для каждого признака, где\n", "\n", "`n_knots` определяет количество узлов (точек, в которых сопрягаются сплайны) для каждого признака. \n", "\n", "`degree` определяет порядок полинома, используемого для построения сплайнов. " ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "sp = SplineTransformer(n_knots=3, degree=3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sp.fit_transform(X_train_sklearn[['area']])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### QuantileTransformer\n", "Этот метод преобразует признаки, чтобы они распределялись равномерно или нормально — так данные меньше подвергаются влиянию выбросов. Преобразование применяется к каждому признаку независимо. Идея метода такова: оценить функцию распределения признака, чтобы преобразовать исходные значения в равномерное или нормальное распределение. \n", "\n", "`output_distribution='uniform'` или\n", "`output_distribution='normal'` соответственно\n", "\n", "\n", "Пример использования: если у вас есть данные о доходах с широким диапазоном значений, квантильное преобразование сделает их более сопоставимыми и устойчивыми к выбросам." 
] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "qt = QuantileTransformer()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "qt.fit_transform(X_train_sklearn[['area']])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Объединяем в ColumnTransformer и создаем Pipeline " ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "pf = PolynomialFeatures(degree=2)\n", "qt = QuantileTransformer()\n", "sp = SplineTransformer(n_knots=3, degree=3)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "# Значения преобразованных признаков нужно отскейлить, поэтому создаем pipeline из двух шагов - преобразование и скейлинг\n", "pf_pipeline = Pipeline(steps=[\n", " ('poly', pf),\n", " ('scale', StandardScaler())\n", "])" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "preprocessor_sklearn = ColumnTransformer(\n", " transformers=[\n", " ('num', s_scaler, num_features), # преобразования для числовых признаков\n", " ('cat', l_encoder, cat_features), # преобразования для категориальных признаков\n", " ('quantile', qt,num_features),\n", " ('poly', pf_pipeline, ['area', 'kitchen_area']), # В преобразования добавляем созданный ранее pipeline\n", " ('spline', sp, ['area'])\n", " ],\n", " remainder='drop',\n", " ) # Удаляем столбцы, которые не затронуты преобразования" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Посмотрим что из себя теперь представляет датафрейм" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "## не влезаем в float64 в полиномальном преобразовании. 
Использовать его нужно с умом!\n",
    "# A single cast is enough (the statement used to be duplicated). numpy.longdouble is used because\n",
    "# the 'float128' alias is not defined on every platform\n",
    "X_train_sklearn[['area', 'kitchen_area']] = X_train_sklearn[['area', 'kitchen_area']].astype(numpy.longdouble)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The result gets its own name: overwriting X_train_sklearn would make this cell fail when re-run,\n",
    "# because the transformed frame no longer has the input columns\n",
    "X_train_sklearn_raw = preprocessor_sklearn.fit_transform(X_train_sklearn)\n",
    "X_train_sklearn_fe = pd.DataFrame(X_train_sklearn_raw, columns=preprocessor_sklearn.get_feature_names_out())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# option_context is a handy way to control how many rows/columns of a DataFrame are displayed\n",
    "with pd.option_context('display.max_rows', 5, 'display.max_columns', None):\n",
    "    display(X_train_sklearn_fe)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Создаем пайплайн с препроцессингом и моделью"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline_sklearn = Pipeline(steps=[\n",
    "    ('transform', preprocessor_sklearn),\n",
    "    ('model', regressor),\n",
    "])\n",
    "\n",
    "model_sklearn = pipeline_sklearn.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions = model_sklearn.predict(X_test)\n",
    "\n",
    "metrics = {\n",
    "    \"mae\": mean_absolute_error(y_test, predictions),\n",
    "    \"mape\": mean_absolute_percentage_error(y_test, predictions),\n",
    "    \"mse\": mean_squared_error(y_test, predictions),\n",
    "}\n",
    "\n",
    "metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
    "RUN_NAME = 'fe_sklearn'\n",
    "\n",
    "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
    "    # unique identifier of this run\n",
    "    run_id = run.info.run_id\n",
    "    mlflow.sklearn.log_model(\n",
    "        model_sklearn,\n",
    "        artifact_path=\"models\",\n",
    "        signature=signature,\n",
    "        input_example=input_example,\n",
    "        pip_requirements=req_file,\n",
    "    )\n",
    "    mlflow.log_metrics(metrics)\n",
    "    mlflow.log_artifact(art)\n",
    "    mlflow.log_params(model_sklearn.get_params())\n",
    "\n",
    "run = mlflow.get_run(run_id)\n",
    "assert run.info.status == 'FINISHED'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Autofeat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "from autofeat import AutoFeatRegressor\n",
    "transformations = [\"1/\", \"exp\", \"log\", \"abs\", \"sqrt\", \"^2\", \"^3\", \"1+\", \"1-\", \"sin\", \"cos\", \"exp-\", \"2^\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "afreg = AutoFeatRegressor(verbose=1, feateng_steps=2, max_gb=8, transformations=[\"log\", \"sqrt\"], feateng_cols=num_features)\n",
    "X_train_arf = afreg.fit_transform(X_train, y_train)\n",
    "X_train_arf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrapper that adds get_feature_names_out(), so the names of the generated features can be retrieved\n",
    "import numpy as np\n",
    "\n",
    "class AutoFeatWrapper():\n",
    "    \"\"\"Adapter around AutoFeatRegressor exposing fit / transform / get_feature_names_out.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    feateng_cols : list\n",
    "        Column names forwarded to AutoFeatRegressor as ``feateng_cols``.\n",
    "    feateng_steps, max_gb, n_jobs, verbose :\n",
    "        Forwarded to AutoFeatRegressor unchanged.\n",
    "    transformations : sequence of str, optional\n",
    "        Transformation names forwarded to AutoFeatRegressor; ``None`` means [\"1/\", \"exp\", \"log\"].\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, feateng_cols, feateng_steps=1, max_gb=16, transformations=None, n_jobs=-1, verbose=1):\n",
    "        self.feateng_cols = feateng_cols\n",
    "        self.feateng_steps = feateng_steps\n",
    "        self.max_gb = max_gb\n",
    "        # None sentinel instead of a list default: a mutable default is shared by all instances\n",
    "        self.transformations = [\"1/\", \"exp\", \"log\"] if transformations is None else list(transformations)\n",
    "        self.n_jobs = n_jobs\n",
    "        # verbose used to be accepted and silently ignored; it is now stored and forwarded\n",
    "        self.verbose = verbose\n",
    "        self.afreg = AutoFeatRegressor(feateng_cols=self.feateng_cols,\n",
    "                                       feateng_steps=self.feateng_steps,\n",
    "                                       max_gb=self.max_gb,\n",
    "                                       transformations=self.transformations,\n",
    "                                       n_jobs=self.n_jobs,\n",
    "                                       verbose=self.verbose)\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        \"\"\"Fit the wrapped AutoFeatRegressor on X, y and return self.\"\"\"\n",
    "        self.afreg.fit(X, y)\n",
    "        return self\n",
    "\n",
    "    def transform(self, X):\n",
    "        \"\"\"Return X transformed by the wrapped AutoFeatRegressor.\"\"\"\n",
    "        return self.afreg.transform(X)\n",
    "\n",
    "    def get_feature_names_out(self, input_features=None):\n",
    "        \"\"\"Return the output column names as a list; input_features is accepted but not used.\"\"\"\n",
    "        # Push a one-row frame of zeros through the wrapped model and read the column names off the result\n",
    "        transformed_X = self.afreg.transform(pd.DataFrame(np.zeros((1, len(self.feateng_cols))), columns=self.feateng_cols))\n",
    "        return transformed_X.columns.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "afreg_pipeline = Pipeline(steps=[\n",
    "    ('autofeat', AutoFeatWrapper(feateng_steps=2, max_gb=16, transformations=[\"log\", \"sqrt\"], feateng_cols=num_features)),\n",
    "    ('scaler', StandardScaler()),\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "preprocessor_afr = ColumnTransformer(\n",
    "    transformers=[\n",
    "        ('num', s_scaler, num_features),  # transformations for numeric features\n",
    "        ('cat', l_encoder, cat_features),  # transformations for categorical features\n",
    "        ('afr', afreg_pipeline, num_features),  # autofeat transformations\n",
    "    ],\n",
    "    remainder='drop',  # drop the columns that no transformer touches\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_afr_raw = preprocessor_afr.fit_transform(X_train, y_train)\n",
    "X_train_afr = pd.DataFrame(X_train_afr_raw, columns=preprocessor_afr.get_feature_names_out())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with pd.option_context('display.max_rows', 5, 'display.max_columns', None):\n",
    "    display(X_train_afr)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline_afr = Pipeline(steps=[\n",
    "    ('preprocessor', preprocessor_afr),\n",
    "    ('model', regressor),\n",
    "])\n",
    "\n",
    "pipeline_afr.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "predictions = pipeline_afr.predict(X_test) \n", "\n", "metrics = {}\n", "metrics[\"mae\"] = mean_absolute_error(y_test, predictions) \n", "metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions)\n", "metrics[\"mse\"] = mean_squared_error(y_test, predictions)\n", "\n", "metrics" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n", "\n", "with mlflow.start_run(run_name='autofeat', experiment_id=experiment_id) as run:\n", " # получаем уникальный идентификатор запуска эксперимента\n", " run_id = run.info.run_id \n", " mlflow.sklearn.log_model(pipeline_afr, \n", " artifact_path=\"models\",\n", " signature=signature,\n", " input_example=input_example,\n", " pip_requirements=req_file\n", " )\n", " mlflow.log_metrics(metrics)\n", " mlflow.log_artifact(art)\n", " mlflow.log_params(pipeline_afr.get_params())\n", "\n", "run = mlflow.get_run(run_id) \n", "assert (run.info.status =='FINISHED')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": ".venv_labs_proj", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }