IIS/assets/mlflow/research.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import mlflow\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "import pandas as pd\n",
    "import numpy\n",
    "\n",
    "from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder\n",
    "\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.ensemble import RandomForestRegressor\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from catboost import CatBoostRegressor\n",
    "\n",
    "from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_pickle('data/clean_data.pkl').sample(frac=0.1, random_state = 2) # Уменьшаем размер чтобы модель быстрее обучалась на лекции\n",
    "df.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename the label column to 'target' and drop the raw 'date' / 'time' columns.\n",
    "# errors='ignore' keeps the cell re-runnable: on a second execution the columns are\n",
    "# already gone and a plain drop() would raise KeyError.\n",
    "df = (\n",
    "    df.rename(columns={'price': 'target'})\n",
    "      .drop(columns=['date', 'time'], errors='ignore')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.25, random_state=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_features = X_train.select_dtypes(include=['category','object']).columns.to_list()\n",
    "cat_features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_features = X_train.select_dtypes(include=['number']).columns.to_list()\n",
    "num_features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "https://scikit-learn.org/stable/api/sklearn.preprocessing.html - разные способы кодирования и скалирования"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "s_scaler = StandardScaler()\n",
    "l_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=99999999) # unknown_value нужно выбирать с умом\n",
    "regressor = CatBoostRegressor()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Column transformer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Для удобной работы со столбцами\n",
    "preprocessor = ColumnTransformer(\n",
    "    transformers=[\n",
    "        ('num', s_scaler, num_features),  # преобразования для числовых признаков\n",
    "        ('cat', l_encoder, cat_features), # преобразования для категориальных признаков\n",
    "    ],\n",
    "    remainder='drop' ) # Удаляем столбцы, которые не затронуты преобразования"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "pipeline = Pipeline(steps=[('preprocessor', preprocessor), \n",
    "                           ('model', regressor)])\n",
    "\n",
    "pipeline.fit(X_train, y_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score the fitted pipeline on the hold-out split\n",
    "predictions = pipeline.predict(X_test)\n",
    "\n",
    "# Collect every metric in one dict so it can be passed to mlflow.log_metrics() as is\n",
    "metrics = {\n",
    "    \"mae\": mean_absolute_error(y_test, predictions),\n",
    "    \"mape\": mean_absolute_percentage_error(y_test, predictions),\n",
    "    \"mse\": mean_squared_error(y_test, predictions),\n",
    "}\n",
    "\n",
    "metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Work with a local MLflow server\n",
    "TRACKING_SERVER_HOST = \"127.0.0.1\"\n",
    "TRACKING_SERVER_PORT = 5000\n",
    "\n",
    "# Tracking and the model registry share one endpoint, so the URI is built once\n",
    "tracking_uri = registry_uri = f\"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}\"\n",
    "\n",
    "mlflow.set_tracking_uri(tracking_uri)\n",
    "mlflow.set_registry_uri(registry_uri)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# название тестового эксперимента, запуска (run) внутри него, имени, под которым модель будет регистрироваться\n",
    "EXPERIMENT_NAME = \"estate_project\"\n",
    "RUN_NAME = \"baseline model\"\n",
    "REGISTRY_MODEL_NAME = \"estate_model_rf\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Логируем вручную"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Always log the model signature and an input example; prepare both here\n",
    "from mlflow.models import infer_signature\n",
    "\n",
    "input_example = X_train.head(5)\n",
    "\n",
    "# Pass the predictions as well, so the signature records the output schema\n",
    "# and not only the input columns.\n",
    "signature = infer_signature(\n",
    "    model_input=input_example,\n",
    "    model_output=pipeline.predict(input_example),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Будем логировать requirements и артефакт - текстовый файл\n",
    "req_file = 'requirements.txt'\n",
    "art = 'comment.txt'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Параметры, котороые будут залогированы, можем задавать вручную или полностью взять из модели\n",
    "#params_dict = {'n_estimators': 10, 'max_depth': 10}\n",
    "params_dict = pipeline.get_params()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reuse the experiment when it already exists: create_experiment() raises when the\n",
    "# name is taken, which would make this cell fail on every run after the first one.\n",
    "experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)\n",
    "if experiment is None:\n",
    "    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)\n",
    "else:\n",
    "    experiment_id = experiment.experiment_id\n",
    "\n",
    "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
    "    # unique identifier of this run\n",
    "    run_id = run.info.run_id\n",
    "    mlflow.sklearn.log_model(pipeline,\n",
    "                             artifact_path=\"models\",\n",
    "                             signature=signature,\n",
    "                             input_example=input_example,\n",
    "                             pip_requirements=req_file\n",
    "                             )\n",
    "    mlflow.log_metrics(metrics)\n",
    "    mlflow.log_artifact(art)\n",
    "    mlflow.log_params(params_dict)\n",
    "\n",
    "# The with-block has been left, so the stored run is expected to be FINISHED\n",
    "run = mlflow.get_run(run_id)\n",
    "assert run.info.status == 'FINISHED'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Удаление runs, experiments\n",
    "\n",
    "Использовать осторожно"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
    "#mlflow.delete_experiment(experiment_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mlflow.search_runs(\n",
    "    #experiment_ids=[experiment_id],\n",
    "    experiment_names=[EXPERIMENT_NAME],\n",
    "    # filter_string='status = \"FAILED\"'\n",
    "    #filter_string='metrics.mae > 1'\n",
    "    \n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "#mlflow.delete_run('74d2a7a40c07413c9cf65df841164356')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Автологирование\n",
    "После включения автологирование срабатывает при каждом обучении модели (при вызове метода `fit()`).\n",
    "\n",
    "У этого подхода есть и плюсы, и минусы. Предлагается сделать прогон и сравнить его с результатами ручного логирования."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mlflow.sklearn.autolog()\n",
    "\n",
    "with mlflow.start_run(run_name='auto', experiment_id=experiment_id) as run:\n",
    "    pipeline.fit(X_train, y_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Отключаем автологирование\n",
    "mlflow.sklearn.autolog(disable=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Model #2\n",
    "Обучим вторую \"маленькую\" модель\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fix the seed so the forest, and the metrics logged for it, are reproducible between runs\n",
    "regressor2 = RandomForestRegressor(n_estimators=10, max_depth=6, random_state=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline = Pipeline(steps=[('preprocessor', preprocessor), \n",
    "                           ('model', regressor2)])\n",
    "\n",
    "pipeline.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score the second (small) model on the hold-out split\n",
    "predictions = pipeline.predict(X_test)\n",
    "\n",
    "metrics = {\n",
    "    \"mae\": mean_absolute_error(y_test, predictions),\n",
    "    \"mape\": mean_absolute_percentage_error(y_test, predictions),\n",
    "    \"mse\": mean_squared_error(y_test, predictions),\n",
    "}\n",
    "\n",
    "metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !!! Проверить название прогона а также все логируемые параметры и артефакты, что они соответствуют второй \"маленькой\" модели. \n",
    "\n",
    "\n",
    "RUN_NAME = 'smaller_model'\n",
    "\n",
    "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
    "\n",
    "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
    "    # получаем уникальный идентификатор запуска эксперимента\n",
    "    run_id = run.info.run_id \n",
    "    mlflow.sklearn.log_model(pipeline, \n",
    "                             artifact_path=\"models\",\n",
    "                             signature=signature,\n",
    "                             input_example=input_example,\n",
    "                             pip_requirements=req_file\n",
    "                             )\n",
    "    mlflow.log_metrics(metrics)\n",
    "    mlflow.log_artifact(art)\n",
    "    mlflow.log_params(pipeline.get_params())\n",
    "\n",
    "run = mlflow.get_run(run_id) \n",
    "assert (run.info.status =='FINISHED')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# No model\n",
    "# Логировать можно только артефакты, без модели. Например, залогироавть графики после этапа EDA\n",
    "\n",
    "RUN_NAME = 'no_model'\n",
    "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
    "\n",
    "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
    "    run_id = run.info.run_id \n",
    "    mlflow.log_artifact(art)\n",
    "\n",
    "\n",
    "run = mlflow.get_run(run_id) \n",
    "assert (run.info.status =='FINISHED')\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Register the model logged by the most recent 'smaller_model' run.\n",
    "# The run id is looked up by run name instead of being hard-coded: ids are generated\n",
    "# anew for every run, so a pasted id only exists on the server it was copied from.\n",
    "model_runs = mlflow.search_runs(\n",
    "    experiment_names=[EXPERIMENT_NAME],\n",
    "    filter_string=\"tags.mlflow.runName = 'smaller_model'\",\n",
    "    order_by=[\"attributes.start_time DESC\"],\n",
    "    max_results=1,\n",
    ")\n",
    "assert not model_runs.empty, \"no 'smaller_model' run found - log the model first\"\n",
    "\n",
    "run_id = model_runs.iloc[0]['run_id']\n",
    "mlflow.register_model(f\"runs:/{run_id}/models\", REGISTRY_MODEL_NAME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Можно регистрировать сразу при создании прогона\n",
    "\n",
    "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
    "\n",
    "with mlflow.start_run(run_name='register_at_run', experiment_id=experiment_id) as run:\n",
    "    # получаем уникальный идентификатор запуска эксперимента\n",
    "    run_id = run.info.run_id \n",
    "    mlflow.sklearn.log_model(pipeline, \n",
    "                             artifact_path=\"models\",\n",
    "                             signature=signature,\n",
    "                             input_example=input_example,\n",
    "                             pip_requirements=req_file,\n",
    "                             registered_model_name = REGISTRY_MODEL_NAME # Указываем для какой модели регистрируем\n",
    "                             )\n",
    "    mlflow.log_metrics(metrics)\n",
    "    mlflow.log_artifact(art)\n",
    "    mlflow.log_params(pipeline.get_params())\n",
    "\n",
    "run = mlflow.get_run(run_id) \n",
    "assert (run.info.status =='FINISHED')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Можно найти зарегистрированные модели\n",
    "model_reg = mlflow.search_registered_models()\n",
    "model_reg[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "model_name = REGISTRY_MODEL_NAME\n",
    "model_version = 1\n",
    "\n",
    "model_loaded = mlflow.sklearn.load_model(model_uri=f\"models:/{model_name}/{model_version}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_loaded.predict(X_test.iloc[0:1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_test.iloc[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature engineering"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import QuantileTransformer, SplineTransformer, PolynomialFeatures, MinMaxScaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_sklearn = X_train.copy()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### PolynomialFeatures\n",
    "Создает полином степени `degree` из указанных признаков\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "pf = PolynomialFeatures(degree=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pf.fit_transform(X_train_sklearn[['area','kitchen_area']])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### SplineTransformer\n",
    "Создаёт новую матрицу признаков, состоящую из сплайнов порядка `degree`. Количество сгенерированных сплайнов равно `n_splines = n_knots + degree - 1` для каждого признака, где:\n",
    "\n",
    "`n_knots` определяет количество узлов (точек, в которых сопрягаются сплайны) для каждого признака.\n",
    "\n",
    "`degree` определяет порядок полинома, используемого для построения сплайнов."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "sp = SplineTransformer(n_knots=3, degree=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sp.fit_transform(X_train_sklearn[['area']])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### QuantileTransformer\n",
    "Этот метод преобразует признаки, чтобы они распределялись равномерно или нормально — так данные меньше подвергаются влиянию выбросов. Преобразование применяется к каждому признаку независимо. Идея метода такова: оценить функцию распределения признака, чтобы преобразовать исходные значения в равномерное или нормальное распределение. \n",
    "\n",
    "`output_distribution='uniform'` или\n",
    "`output_distribution='normal'` соответственно\n",
    "\n",
    "\n",
    "Пример использования: если у вас есть данные о доходах с широким диапазоном значений, квантильное преобразование сделает их более сопоставимыми и устойчивыми к выбросам."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "qt = QuantileTransformer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "qt.fit_transform(X_train_sklearn[['area']])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Объединяем в ColumnTransformer и создаем Pipeline "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "pf = PolynomialFeatures(degree=2)\n",
    "qt = QuantileTransformer()\n",
    "sp = SplineTransformer(n_knots=3, degree=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Значения преобразованных признаков нужно отскейлить, поэтому создаем pipeline из двух шагов - преобразование и скейлинг\n",
    "pf_pipeline = Pipeline(steps=[\n",
    "    ('poly', pf),\n",
    "    ('scale', StandardScaler())\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "preprocessor_sklearn = ColumnTransformer(\n",
    "    transformers=[\n",
    "        ('num', s_scaler, num_features),  # преобразования для числовых признаков\n",
    "        ('cat', l_encoder, cat_features), # преобразования для категориальных признаков\n",
    "        ('quantile', qt,num_features),\n",
    "        ('poly', pf_pipeline, ['area', 'kitchen_area']), # В преобразования добавляем созданный ранее pipeline\n",
    "        ('spline', sp, ['area'])\n",
    "    ],\n",
    "    remainder='drop',\n",
    "    ) # Удаляем столбцы, которые не затронуты преобразования"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Посмотрим что из себя теперь представляет датафрейм"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The polynomial transformation does not fit into float64, so widen the columns first.\n",
    "# Use this transformation with care!\n",
    "# 'longdouble' resolves to float128 where the platform provides it; the literal\n",
    "# 'float128' dtype name is not defined on every platform. One cast is enough.\n",
    "X_train_sklearn[['area', 'kitchen_area']] = X_train_sklearn[['area', 'kitchen_area']].astype('longdouble')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_sklearn_raw = preprocessor_sklearn.fit_transform(X_train_sklearn)\n",
    "X_train_sklearn = pd.DataFrame(X_train_sklearn_raw, columns=preprocessor_sklearn.get_feature_names_out())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Удобно использовать для отображения всех строк\\столбцов в DataFrame\n",
    "with pd.option_context('display.max_rows', 5, 'display.max_columns', None):\n",
    "    display (X_train_sklearn)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Создаем пайплайн с препроцессингом и моделью"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline_sklearn = Pipeline(steps=[\n",
    "    ('transform', preprocessor_sklearn),\n",
    "    ('model', regressor)\n",
    "])\n",
    "\n",
    "model_sklearn = pipeline_sklearn.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score the feature-engineered sklearn pipeline on the hold-out split\n",
    "predictions = model_sklearn.predict(X_test)\n",
    "\n",
    "metrics = {\n",
    "    \"mae\": mean_absolute_error(y_test, predictions),\n",
    "    \"mape\": mean_absolute_percentage_error(y_test, predictions),\n",
    "    \"mse\": mean_squared_error(y_test, predictions),\n",
    "}\n",
    "\n",
    "metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
    "RUN_NAME = 'fe_sklearn'\n",
    "\n",
    "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
    "    # получаем уникальный идентификатор запуска эксперимента\n",
    "    run_id = run.info.run_id \n",
    "    mlflow.sklearn.log_model(model_sklearn, \n",
    "                             artifact_path=\"models\",\n",
    "                             signature=signature,\n",
    "                             input_example=input_example,\n",
    "                             pip_requirements=req_file\n",
    "                             )\n",
    "    mlflow.log_metrics(metrics)\n",
    "    mlflow.log_artifact(art)\n",
    "    mlflow.log_params(model_sklearn.get_params())\n",
    "\n",
    "run = mlflow.get_run(run_id) \n",
    "assert (run.info.status =='FINISHED')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Autofeat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "from autofeat import AutoFeatRegressor\n",
    "transformations = [\"1/\", \"exp\", \"log\", \"abs\", \"sqrt\", \"^2\", \"^3\", \"1+\", \"1-\", \"sin\", \"cos\", \"exp-\", \"2^\"] "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "afreg = AutoFeatRegressor(verbose=1, feateng_steps=2, max_gb=8, transformations=[\"log\", \"sqrt\"],feateng_cols=num_features)\n",
    "X_train_arf = afreg.fit_transform(X_train,y_train)\n",
    "X_train_arf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrapper that adds get_feature_names_out() so the AutoFeat step can report feature names\n",
    "import numpy as np\n",
    "\n",
    "class AutoFeatWrapper():\n",
    "    \"\"\"Thin adapter around AutoFeatRegressor exposing fit / transform / get_feature_names_out.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    feateng_cols : list of str\n",
    "        Columns handed to AutoFeatRegressor as ``feateng_cols``.\n",
    "    feateng_steps, max_gb, n_jobs, verbose\n",
    "        Forwarded to AutoFeatRegressor unchanged.\n",
    "    transformations : iterable of str\n",
    "        Transformation names forwarded to AutoFeatRegressor.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, feateng_cols, feateng_steps=1, max_gb=16, transformations=(\"1/\", \"exp\", \"log\"), n_jobs=-1, verbose=1):\n",
    "        self.feateng_cols = feateng_cols\n",
    "        self.feateng_steps = feateng_steps\n",
    "        self.max_gb = max_gb\n",
    "        # Immutable default + private copy: instances never share one list object\n",
    "        self.transformations = list(transformations)\n",
    "        self.n_jobs = n_jobs\n",
    "        # verbose used to be accepted and silently dropped; keep it and pass it on\n",
    "        self.verbose = verbose\n",
    "        self.afreg = AutoFeatRegressor(feateng_cols=self.feateng_cols,\n",
    "                                     feateng_steps=self.feateng_steps,\n",
    "                                     max_gb=self.max_gb,\n",
    "                                     transformations=self.transformations,\n",
    "                                     n_jobs=self.n_jobs,\n",
    "                                     verbose=self.verbose)\n",
    "\n",
    "    def fit(self, X, y=None):\n",
    "        \"\"\"Fit the wrapped AutoFeatRegressor on (X, y) and return self.\"\"\"\n",
    "        self.afreg.fit(X, y)\n",
    "        return self\n",
    "\n",
    "    def transform(self, X):\n",
    "        \"\"\"Return X transformed by the wrapped AutoFeatRegressor.\"\"\"\n",
    "        return self.afreg.transform(X)\n",
    "\n",
    "    def get_feature_names_out(self, input_features=None):\n",
    "        \"\"\"Return the output column names as a list.\n",
    "\n",
    "        ``input_features`` is accepted but not used. The names are read from the\n",
    "        columns of the frame produced by transforming one all-zero row that has\n",
    "        the ``feateng_cols`` columns.\n",
    "        \"\"\"\n",
    "        probe = pd.DataFrame(np.zeros((1, len(self.feateng_cols))), columns=self.feateng_cols)\n",
    "        transformed_X = self.afreg.transform(probe)\n",
    "        return transformed_X.columns.tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "afreg_pipeline = Pipeline(steps=[\n",
    "    ('autofeat', AutoFeatWrapper( feateng_steps=2, max_gb=16, transformations=[\"log\", \"sqrt\"],feateng_cols=num_features)),\n",
    "    ('scaler', StandardScaler()),\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "preprocessor_afr = ColumnTransformer(\n",
    "    transformers=[\n",
    "        ('num', s_scaler, num_features),  # преобразования для числовых признаков\n",
    "        ('cat', l_encoder, cat_features), # преобразования для категориальных признаков\n",
    "        ('afr', afreg_pipeline, num_features), # преобразования autofeat\n",
    "    ],\n",
    "    remainder='drop', # Удаляем столбцы, которые не затронуты преобразованиями\n",
    "    ) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_afr_raw =  preprocessor_afr.fit_transform(X_train,y_train)\n",
    "X_train_afr = pd.DataFrame(X_train_afr_raw, columns=preprocessor_afr.get_feature_names_out())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with pd.option_context('display.max_rows', 5, 'display.max_columns', None):\n",
    "    display (X_train_afr)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pipeline_afr = Pipeline(steps=[('preprocessor', preprocessor_afr), \n",
    "                               ('model', regressor)])\n",
    "\n",
    "pipeline_afr.fit(X_train, y_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score the AutoFeat pipeline on the hold-out split\n",
    "predictions = pipeline_afr.predict(X_test)\n",
    "\n",
    "metrics = {\n",
    "    \"mae\": mean_absolute_error(y_test, predictions),\n",
    "    \"mape\": mean_absolute_percentage_error(y_test, predictions),\n",
    "    \"mse\": mean_squared_error(y_test, predictions),\n",
    "}\n",
    "\n",
    "metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
    "\n",
    "with mlflow.start_run(run_name='autofeat', experiment_id=experiment_id) as run:\n",
    "    # получаем уникальный идентификатор запуска эксперимента\n",
    "    run_id = run.info.run_id \n",
    "    mlflow.sklearn.log_model(pipeline_afr, \n",
    "                             artifact_path=\"models\",\n",
    "                             signature=signature,\n",
    "                             input_example=input_example,\n",
    "                             pip_requirements=req_file\n",
    "                             )\n",
    "    mlflow.log_metrics(metrics)\n",
    "    mlflow.log_artifact(art)\n",
    "    mlflow.log_params(pipeline_afr.get_params())\n",
    "\n",
    "run = mlflow.get_run(run_id) \n",
    "assert (run.info.status =='FINISHED')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv_labs_proj",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}