{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import mlflow\n", "\n", "from sklearn.model_selection import train_test_split\n", "import pandas as pd\n", "import numpy\n", "\n", "from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder\n", "\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.linear_model import LinearRegression\n", "from catboost import CatBoostRegressor\n", "\n", "from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_pickle('data/clean_data.pkl').sample(frac=0.1, random_state = 2) # Уменьшаем размер чтобы модель быстрее обучалась на лекции\n", "df.info()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "df = df.rename(columns={'price': 'target'})\n", "df = df.drop(columns=['date', 'time'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.25, random_state=2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cat_features = X_train.select_dtypes(include=['category','object']).columns.to_list()\n", "cat_features" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "num_features = X_train.select_dtypes(include=['number']).columns.to_list()\n", "num_features" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "https://scikit-learn.org/stable/api/sklearn.preprocessing.html - разные способы кодирования и 
скалирования" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "s_scaler = StandardScaler()\n", "l_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=99999999) # unknown_value нужно выбирать с умом\n", "regressor = CatBoostRegressor()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Column transformer" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Для удобной работы со столбцами\n", "preprocessor = ColumnTransformer(\n", " transformers=[\n", " ('num', s_scaler, num_features), # преобразования для числовых признаков\n", " ('cat', l_encoder, cat_features), # преобразования для категориальных признаков\n", " ],\n", " remainder='drop' ) # Удаляем столбцы, которые не затронуты преобразования" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "pipeline = Pipeline(steps=[('preprocessor', preprocessor), \n", " ('model', regressor)])\n", "\n", "pipeline.fit(X_train, y_train)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "predictions = pipeline.predict(X_test) \n", "\n", "metrics = {}\n", "metrics[\"mae\"] = mean_absolute_error(y_test, predictions) \n", "metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions)\n", "metrics[\"mse\"] = mean_squared_error(y_test, predictions)\n", "\n", "metrics" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "\n", "# Работаем с MLflow локально\n", "TRACKING_SERVER_HOST = \"127.0.0.1\"\n", "TRACKING_SERVER_PORT = 5000\n", "\n", "registry_uri = f\"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}\"\n", "tracking_uri = f\"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}\"\n", "\n", "mlflow.set_tracking_uri(tracking_uri) \n", "mlflow.set_registry_uri(registry_uri) \n", "\n" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, 
"outputs": [], "source": [ "# Names of the MLflow experiment, of the run inside it, and of the model in the registry\n", "EXPERIMENT_NAME = \"estate_project\"\n", "RUN_NAME = \"baseline model\"\n", "REGISTRY_MODEL_NAME = \"estate_model_rf\"" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Логируем вручную" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Always log the model signature and an input example; prepare both here\n", "from mlflow.models import infer_signature\n", "\n", "input_example = X_train.head(5)\n", "# Pass the predictions too, so the signature records the output schema and not only the inputs\n", "signature = infer_signature(model_input=input_example, model_output=pipeline.predict(input_example))" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# We will log the requirements file and a text file as an artifact\n", "req_file = 'requirements.txt'\n", "art = 'comment.txt'" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# Parameters to log: either set them by hand or take all of them from the fitted pipeline\n", "#params_dict = {'n_estimators': 10, 'max_depth': 10}\n", "params_dict = pipeline.get_params()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Reuse the experiment when it already exists: mlflow.create_experiment() raises if the name is\n", "# already taken, which made this cell fail on every re-run of the notebook\n", "existing_experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)\n", "if existing_experiment is None:\n", "    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)\n", "else:\n", "    experiment_id = existing_experiment.experiment_id\n", "\n", "# Впоследствии. 
чтобы добавлять запуски в этот же эксепримент мы должны получить его id:\n", "#experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n", "\n", "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n", " # получаем уникальный идентификатор запуска эксперимента\n", " run_id = run.info.run_id \n", " mlflow.sklearn.log_model(pipeline, \n", " artifact_path=\"models\",\n", " signature=signature,\n", " input_example=input_example,\n", " pip_requirements=req_file\n", " )\n", " mlflow.log_metrics(metrics)\n", " mlflow.log_artifact(art)\n", " mlflow.log_params(params_dict)\n", "\n", "run = mlflow.get_run(run_id) \n", "assert (run.info.status =='FINISHED')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Удаление runs, experiments\n", "\n", "Использовать осторожно" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n", "#mlflow.delete_experiment(experiment_id)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mlflow.search_runs(\n", " #experiment_ids=[experiment_id],\n", " experiment_names=[EXPERIMENT_NAME],\n", " # filter_string='status = \"FAILED\"'\n", " #filter_string='metrics.mae > 1'\n", " \n", ")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "\n", "#mlflow.delete_run('74d2a7a40c07413c9cf65df841164356')" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Автологирование\n", "После включения будет срабатывать на каждом обучении модели (на методе fit()).\n", "\n", "Есть плюсы, есть и минусы. 
Предлагается сделать прогон и сравнить с результатами вручную " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mlflow.sklearn.autolog()\n", "\n", "with mlflow.start_run(run_name='auto', experiment_id=experiment_id) as run:\n", " pipeline.fit(X_train, y_train)\n" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# Отключаем автологирование\n", "mlflow.sklearn.autolog(disable=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Model #2\n", "Обучим вторую \"маленькую\" модель\n" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "regressor2 = RandomForestRegressor(n_estimators=10, max_depth=6)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pipeline = Pipeline(steps=[('preprocessor', preprocessor), \n", " ('model', regressor2)])\n", "\n", "pipeline.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "predictions = pipeline.predict(X_test) \n", "metrics = {}\n", "metrics[\"mae\"] = mean_absolute_error(y_test, predictions) \n", "metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions)\n", "metrics[\"mse\"] = mean_squared_error(y_test, predictions)\n", "\n", "metrics" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# !!! Проверить название прогона а также все логируемые параметры и артефакты, что они соответствуют второй \"маленькой\" модели. 
\n", "\n", "\n", "RUN_NAME = 'smaller_model'\n", "\n", "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n", "\n", "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n", "    # unique identifier of this run\n", "    run_id = run.info.run_id\n", "    mlflow.sklearn.log_model(pipeline,\n", "                             artifact_path=\"models\",\n", "                             signature=signature,\n", "                             input_example=input_example,\n", "                             pip_requirements=req_file\n", "                             )\n", "    mlflow.log_metrics(metrics)\n", "    mlflow.log_artifact(art)\n", "    mlflow.log_params(pipeline.get_params())\n", "\n", "# Keep this id under its own name: run_id is overwritten by the runs started in the cells below\n", "smaller_model_run_id = run_id\n", "\n", "run = mlflow.get_run(run_id)\n", "assert (run.info.status =='FINISHED')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# No model\n", "# A run may log artifacts only, without a model - for example, plots produced during EDA\n", "\n", "RUN_NAME = 'no_model'\n", "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n", "\n", "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n", "    run_id = run.info.run_id\n", "    mlflow.log_artifact(art)\n", "\n", "\n", "run = mlflow.get_run(run_id)\n", "assert (run.info.status =='FINISHED')\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Register the model logged by the 'smaller_model' run. The id is taken from that run instead of a\n", "# hard-coded value, and not from run_id, which now points at the 'no_model' run (no model artifact)\n", "mlflow.register_model(f\"runs:/{smaller_model_run_id}/models\", REGISTRY_MODEL_NAME)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Можно регистрировать сразу при создании прогона\n", "\n", "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n", "\n", "with mlflow.start_run(run_name='register_at_run', experiment_id=experiment_id) as run:\n", " # получаем уникальный идентификатор запуска эксперимента\n", " run_id = run.info.run_id \n", "
mlflow.sklearn.log_model(pipeline, \n", " artifact_path=\"models\",\n", " signature=signature,\n", " input_example=input_example,\n", " pip_requirements=req_file,\n", " registered_model_name = REGISTRY_MODEL_NAME # Указываем для какой модели регистрируем\n", " )\n", " mlflow.log_metrics(metrics)\n", " mlflow.log_artifact(art)\n", " mlflow.log_params(pipeline.get_params())\n", "\n", "run = mlflow.get_run(run_id) \n", "assert (run.info.status =='FINISHED')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Можно найти зарегистрированные модели\n", "model_reg = mlflow.search_registered_models()\n", "model_reg[0]" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "\n", "model_name = REGISTRY_MODEL_NAME\n", "model_version = 1\n", "\n", "model_loaded = mlflow.sklearn.load_model(model_uri=f\"models:/{model_name}/{model_version}\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model_loaded.predict(X_test.iloc[0:1])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_test.iloc[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Feature engineering" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Sklearn" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import QuantileTransformer, SplineTransformer, PolynomialFeatures, MinMaxScaler" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "X_train_sklearn = X_train.copy()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### PolynomialFeatures\n", "Создает полином степени `degree` из указанных признаков\n" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], 
"source": [ "pf = PolynomialFeatures(degree=2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train_sklearn" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pf.fit_transform(X_train_sklearn[['area','kitchen_area']])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### SplineTransformer\n", "Cоздаёт новую матрицу признаков, состоящую из сплайнов порядка degree. Количество сгенерированных сплайнов равно `n_splines=n_knots + degree - 1` для каждого признака, где\n", "\n", "`n_knots` определяет количество узлов (точек, в которых сопрягаются сплайны) для каждого признака. \n", "\n", "`degree` определяет порядок полинома, используемого для построения сплайнов. " ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "sp = SplineTransformer(n_knots=3, degree=3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sp.fit_transform(X_train_sklearn[['area']])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### QuantileTransformer\n", "Этот метод преобразует признаки, чтобы они распределялись равномерно или нормально — так данные меньше подвергаются влиянию выбросов. Преобразование применяется к каждому признаку независимо. Идея метода такова: оценить функцию распределения признака, чтобы преобразовать исходные значения в равномерное или нормальное распределение. \n", "\n", "`output_distribution='uniform'` или\n", "`output_distribution='normal'` соответственно\n", "\n", "\n", "Пример использования: если у вас есть данные о доходах с широким диапазоном значений, квантильное преобразование сделает их более сопоставимыми и устойчивыми к выбросам." 
] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "qt = QuantileTransformer()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "qt.fit_transform(X_train_sklearn[['area']])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Объединяем в ColumnTransformer и создаем Pipeline " ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "pf = PolynomialFeatures(degree=2)\n", "qt = QuantileTransformer()\n", "sp = SplineTransformer(n_knots=3, degree=3)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "# Значения преобразованных признаков нужно отскейлить, поэтому создаем pipeline из двух шагов - преобразование и скейлинг\n", "pf_pipeline = Pipeline(steps=[\n", " ('poly', pf),\n", " ('scale', StandardScaler())\n", "])" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "preprocessor_sklearn = ColumnTransformer(\n", " transformers=[\n", " ('num', s_scaler, num_features), # преобразования для числовых признаков\n", " ('cat', l_encoder, cat_features), # преобразования для категориальных признаков\n", " ('quantile', qt,num_features),\n", " ('poly', pf_pipeline, ['area', 'kitchen_area']), # В преобразования добавляем созданный ранее pipeline\n", " ('spline', sp, ['area'])\n", " ],\n", " remainder='drop',\n", " ) # Удаляем столбцы, которые не затронуты преобразования" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Посмотрим что из себя теперь представляет датафрейм" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [ "## не влезаем в float64 в полиномальном преобразовании. 
Использовать его нужно с умом!\n", "# 'longdouble' is numpy's portable name for extended precision; 'float128' is not defined on every platform\n", "X_train_sklearn[['area', 'kitchen_area']] = X_train_sklearn[['area', 'kitchen_area']].astype('longdouble')" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "X_train_sklearn_raw = preprocessor_sklearn.fit_transform(X_train_sklearn)\n", "X_train_sklearn = pd.DataFrame(X_train_sklearn_raw, columns=preprocessor_sklearn.get_feature_names_out())" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
num__geo_latnum__geo_lonnum__levelnum__levelsnum__roomsnum__areanum__kitchen_areacat__regioncat__building_typecat__object_typequantile__geo_latquantile__geo_lonquantile__levelquantile__levelsquantile__roomsquantile__areaquantile__kitchen_areapoly__1poly__areapoly__kitchen_areapoly__area^2poly__area kitchen_areapoly__kitchen_area^2spline__area_sp_0spline__area_sp_1spline__area_sp_2spline__area_sp_3spline__area_sp_4
00.495902-0.4497420.359235-0.2147890.2534130.063735-0.18628520.01.00.00.7662570.5110280.7172170.5365370.6006010.6236240.3748750.00.063735-0.186285-0.010002-0.132188-0.0027920.1558060.6661790.1780130.0000020.0
10.1778061.433673-0.246529-0.3677180.253413-0.114293-0.18628570.01.00.00.2971420.8679990.5220220.3868870.6006010.5415420.3748750.0-0.114293-0.186285-0.017375-0.169370-0.0027920.1569210.6662750.1768030.0000010.0
.......................................................................................
410773-0.748366-0.804077-0.6503710.7027880.2534131.3654411.50183352.03.00.00.1931430.1147530.3098100.7417420.6006010.9613670.9845350.01.3654411.5018330.0684381.5701630.0086160.1478200.6651590.1870110.0000100.0
4107741.257769-1.101815-0.0446080.0910701.1759110.553789-0.14254414.01.00.00.9080360.0757250.6046050.6456460.8673670.8418420.4364360.00.553789-0.1425440.014463-0.002742-0.0026490.1527670.6658600.1813700.0000040.0
\n", "

410775 rows × 28 columns

\n", "
" ], "text/plain": [ " num__geo_lat num__geo_lon num__level num__levels num__rooms \\\n", "0 0.495902 -0.449742 0.359235 -0.214789 0.253413 \n", "1 0.177806 1.433673 -0.246529 -0.367718 0.253413 \n", "... ... ... ... ... ... \n", "410773 -0.748366 -0.804077 -0.650371 0.702788 0.253413 \n", "410774 1.257769 -1.101815 -0.044608 0.091070 1.175911 \n", "\n", " num__area num__kitchen_area cat__region cat__building_type \\\n", "0 0.063735 -0.186285 20.0 1.0 \n", "1 -0.114293 -0.186285 70.0 1.0 \n", "... ... ... ... ... \n", "410773 1.365441 1.501833 52.0 3.0 \n", "410774 0.553789 -0.142544 14.0 1.0 \n", "\n", " cat__object_type quantile__geo_lat quantile__geo_lon \\\n", "0 0.0 0.766257 0.511028 \n", "1 0.0 0.297142 0.867999 \n", "... ... ... ... \n", "410773 0.0 0.193143 0.114753 \n", "410774 0.0 0.908036 0.075725 \n", "\n", " quantile__level quantile__levels quantile__rooms quantile__area \\\n", "0 0.717217 0.536537 0.600601 0.623624 \n", "1 0.522022 0.386887 0.600601 0.541542 \n", "... ... ... ... ... \n", "410773 0.309810 0.741742 0.600601 0.961367 \n", "410774 0.604605 0.645646 0.867367 0.841842 \n", "\n", " quantile__kitchen_area poly__1 poly__area poly__kitchen_area \\\n", "0 0.374875 0.0 0.063735 -0.186285 \n", "1 0.374875 0.0 -0.114293 -0.186285 \n", "... ... ... ... ... \n", "410773 0.984535 0.0 1.365441 1.501833 \n", "410774 0.436436 0.0 0.553789 -0.142544 \n", "\n", " poly__area^2 poly__area kitchen_area poly__kitchen_area^2 \\\n", "0 -0.010002 -0.132188 -0.002792 \n", "1 -0.017375 -0.169370 -0.002792 \n", "... ... ... ... \n", "410773 0.068438 1.570163 0.008616 \n", "410774 0.014463 -0.002742 -0.002649 \n", "\n", " spline__area_sp_0 spline__area_sp_1 spline__area_sp_2 \\\n", "0 0.155806 0.666179 0.178013 \n", "1 0.156921 0.666275 0.176803 \n", "... ... ... ... \n", "410773 0.147820 0.665159 0.187011 \n", "410774 0.152767 0.665860 0.181370 \n", "\n", " spline__area_sp_3 spline__area_sp_4 \n", "0 0.000002 0.0 \n", "1 0.000001 0.0 \n", "... ... ... 
\n", "410773 0.000010 0.0 \n", "410774 0.000004 0.0 \n", "\n", "[410775 rows x 28 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Удобно использовать для отображения всех строк\\столбцов в DataFrame\n", "with pd.option_context('display.max_rows', 5, 'display.max_columns', None):\n", " display (X_train_sklearn)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Создаем пайплайн с препроцессингом и моделью" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pipeline_sklearn = Pipeline(steps=[\n", " ('transform', preprocessor_sklearn),\n", " ('model', regressor)\n", "])\n", "\n", "model_sklearn = pipeline_sklearn.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model_sklearn" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "predictions = model_sklearn.predict(X_test) \n", "metrics = {}\n", "metrics[\"mae\"] = mean_absolute_error(y_test, predictions) \n", "metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions)\n", "metrics[\"mse\"] = mean_squared_error(y_test, predictions)\n", "\n", "metrics" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n", "RUN_NAME = 'fe_sklearn'\n", "\n", "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n", " # получаем уникальный идентификатор запуска эксперимента\n", " run_id = run.info.run_id \n", " mlflow.sklearn.log_model(model_sklearn, \n", " artifact_path=\"models\",\n", " signature=signature,\n", " input_example=input_example,\n", " pip_requirements=req_file\n", " )\n", " mlflow.log_metrics(metrics)\n", " mlflow.log_artifact(art)\n", " mlflow.log_params(model_sklearn.get_params())\n", "\n", "run = mlflow.get_run(run_id) \n", "assert (run.info.status 
=='FINISHED')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Autofeat" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [], "source": [ "from autofeat import AutoFeatRegressor\n", "transformations = [\"1/\", \"exp\", \"log\", \"abs\", \"sqrt\", \"^2\", \"^3\", \"1+\", \"1-\", \"sin\", \"cos\", \"exp-\", \"2^\"] " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "afreg = AutoFeatRegressor(verbose=1, feateng_steps=2, max_gb=8, transformations=[\"log\", \"sqrt\"],feateng_cols=num_features)\n", "X_train_arf = afreg.fit_transform(X_train,y_train)\n", "X_train_arf" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "# Wrapper that adds get_feature_names_out(), so the transformer can report the names of its features\n", "import numpy as np\n", "\n", "class AutoFeatWrapper():\n", "    \"\"\"Adapter around AutoFeatRegressor that exposes fit, transform and get_feature_names_out.\n", "\n", "    All constructor arguments are stored on the instance and forwarded to AutoFeatRegressor:\n", "    feateng_cols, feateng_steps, max_gb, transformations, n_jobs and verbose.\n", "    \"\"\"\n", "\n", "    # transformations defaults to a tuple: a list default would be one object shared by all instances\n", "    def __init__(self, feateng_cols, feateng_steps=1, max_gb=16, transformations=(\"1/\", \"exp\", \"log\"), n_jobs=-1, verbose=1):\n", "        self.feateng_cols = feateng_cols\n", "        self.feateng_steps = feateng_steps\n", "        self.max_gb = max_gb\n", "        self.transformations = transformations\n", "        self.n_jobs = n_jobs\n", "        # verbose used to be accepted and then dropped; it is now kept and passed on\n", "        self.verbose = verbose\n", "        self.afreg = AutoFeatRegressor(feateng_cols=self.feateng_cols,\n", "                                       feateng_steps=self.feateng_steps,\n", "                                       max_gb=self.max_gb,\n", "                                       transformations=self.transformations,\n", "                                       n_jobs=self.n_jobs,\n", "                                       verbose=self.verbose)\n", "\n", "    def fit(self, X, y=None):\n", "        \"\"\"Fit the wrapped AutoFeatRegressor on X and y and return self.\"\"\"\n", "        self.afreg.fit(X, y)\n", "        return self\n", "\n", "    def transform(self, X):\n", "        \"\"\"Return the result of the wrapped AutoFeatRegressor.transform(X).\"\"\"\n", "        return self.afreg.transform(X)\n", "\n", "    def get_feature_names_out(self, input_features=None):\n", "        \"\"\"Return the names of the produced features as a list.\n", "\n", "        input_features is accepted for compatibility with the scikit-learn API and is not used.\n", "        \"\"\"\n", "        # Transform a one-row dummy frame and read the column names from the result.\n", "        # Ones are used instead of zeros so that log and 1/x stay finite on the dummy row.\n", "        dummy_row = pd.DataFrame(np.ones((1, len(self.feateng_cols))), columns=self.feateng_cols)\n", "        return self.afreg.transform(dummy_row).columns.tolist()" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ 
"afreg_pipeline = Pipeline(steps=[\n", " ('autofeat', AutoFeatWrapper( feateng_steps=2, max_gb=16, transformations=[\"log\", \"sqrt\"],feateng_cols=num_features)),\n", " ('scaler', StandardScaler()),\n", "])" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "preprocessor_afr = ColumnTransformer(\n", " transformers=[\n", " ('num', s_scaler, num_features), # преобразования для числовых признаков\n", " ('cat', l_encoder, cat_features), # преобразования для категориальных признаков\n", " ('afr', afreg_pipeline, num_features), # преобразования autofeat\n", " ],\n", " remainder='drop', # Удаляем столбцы, которые не затронуты преобразованиями\n", " ) " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train_afr_raw = preprocessor_afr.fit_transform(X_train,y_train)\n", "X_train_afr = pd.DataFrame(X_train_afr_raw, columns=preprocessor_afr.get_feature_names_out())" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
num__geo_latnum__geo_lonnum__levelnum__levelsnum__roomsnum__areanum__kitchen_areacat__regioncat__building_typecat__object_typeafr__geo_latafr__geo_lonafr__levelafr__levelsafr__roomsafr__areaafr__kitchen_areaafr__area*roomsafr__area*geo_lonafr__levels*roomsafr__area*kitchen_areaafr__sqrt(area)*geo_latafr__sqrt(area)*log(level)afr__kitchen_area*log(level)afr__sqrt(area)*kitchen_areaafr__geo_lon*log(kitchen_area)afr__sqrt(area)*sqrt(kitchen_area)afr__sqrt(geo_lon)*sqrt(kitchen_area)afr__log(area)afr__rooms*log(level)afr__kitchen_area*roomsafr__kitchen_area*levelsafr__sqrt(geo_lon)*sqrt(level)afr__area**(3/2)afr__geo_lat*log(kitchen_area)afr__geo_lat*log(geo_lon)
00.495902-0.4497420.359235-0.2147890.2534130.063735-0.18628520.01.00.00.495902-0.4497420.359235-0.2147890.2534130.063735-0.1862850.006208-0.1951290.060916-0.1321880.3731510.6880760.044178-0.211335-0.481294-0.153548-0.4908050.3078350.690329-0.132529-0.3528340.323880-0.008748-0.0315290.068167
10.1778061.433673-0.246529-0.3677180.253413-0.114293-0.18628570.01.00.00.1778061.433673-0.246529-0.3677180.253413-0.114293-0.186285-0.0834020.655053-0.054279-0.1693700.0051140.071369-0.173647-0.2527751.191304-0.2672680.6157980.0319070.282625-0.132529-0.4186430.552794-0.056540-0.1438291.129118
...............................................................................................................
410773-0.748366-0.804077-0.6503710.7027880.2534131.3654411.50183352.03.00.0-0.748366-0.804077-0.6503710.7027880.2534131.3654411.5018330.6614270.3751990.7520881.5701631.274445-0.0025210.7455072.3822580.0715992.8288901.4312721.729715-0.1604911.5814362.432437-0.8431500.4114751.671069-1.052343
4107741.257769-1.101815-0.0446080.0910701.1759110.553789-0.14254414.01.00.01.257769-1.101815-0.0446080.0910701.1759110.553789-0.1425440.807887-0.3300700.982478-0.0027421.3389960.635065-0.040302-0.055435-1.0255880.202136-0.9160540.9406241.2179100.311575-0.174762-0.4153590.1356170.359680-0.246790
\n", "

410775 rows × 36 columns

\n", "
" ], "text/plain": [ " num__geo_lat num__geo_lon num__level num__levels num__rooms \\\n", "0 0.495902 -0.449742 0.359235 -0.214789 0.253413 \n", "1 0.177806 1.433673 -0.246529 -0.367718 0.253413 \n", "... ... ... ... ... ... \n", "410773 -0.748366 -0.804077 -0.650371 0.702788 0.253413 \n", "410774 1.257769 -1.101815 -0.044608 0.091070 1.175911 \n", "\n", " num__area num__kitchen_area cat__region cat__building_type \\\n", "0 0.063735 -0.186285 20.0 1.0 \n", "1 -0.114293 -0.186285 70.0 1.0 \n", "... ... ... ... ... \n", "410773 1.365441 1.501833 52.0 3.0 \n", "410774 0.553789 -0.142544 14.0 1.0 \n", "\n", " cat__object_type afr__geo_lat afr__geo_lon afr__level afr__levels \\\n", "0 0.0 0.495902 -0.449742 0.359235 -0.214789 \n", "1 0.0 0.177806 1.433673 -0.246529 -0.367718 \n", "... ... ... ... ... ... \n", "410773 0.0 -0.748366 -0.804077 -0.650371 0.702788 \n", "410774 0.0 1.257769 -1.101815 -0.044608 0.091070 \n", "\n", " afr__rooms afr__area afr__kitchen_area afr__area*rooms \\\n", "0 0.253413 0.063735 -0.186285 0.006208 \n", "1 0.253413 -0.114293 -0.186285 -0.083402 \n", "... ... ... ... ... \n", "410773 0.253413 1.365441 1.501833 0.661427 \n", "410774 1.175911 0.553789 -0.142544 0.807887 \n", "\n", " afr__area*geo_lon afr__levels*rooms afr__area*kitchen_area \\\n", "0 -0.195129 0.060916 -0.132188 \n", "1 0.655053 -0.054279 -0.169370 \n", "... ... ... ... \n", "410773 0.375199 0.752088 1.570163 \n", "410774 -0.330070 0.982478 -0.002742 \n", "\n", " afr__sqrt(area)*geo_lat afr__sqrt(area)*log(level) \\\n", "0 0.373151 0.688076 \n", "1 0.005114 0.071369 \n", "... ... ... \n", "410773 1.274445 -0.002521 \n", "410774 1.338996 0.635065 \n", "\n", " afr__kitchen_area*log(level) afr__sqrt(area)*kitchen_area \\\n", "0 0.044178 -0.211335 \n", "1 -0.173647 -0.252775 \n", "... ... ... 
\n", "410773 0.745507 2.382258 \n", "410774 -0.040302 -0.055435 \n", "\n", " afr__geo_lon*log(kitchen_area) afr__sqrt(area)*sqrt(kitchen_area) \\\n", "0 -0.481294 -0.153548 \n", "1 1.191304 -0.267268 \n", "... ... ... \n", "410773 0.071599 2.828890 \n", "410774 -1.025588 0.202136 \n", "\n", " afr__sqrt(geo_lon)*sqrt(kitchen_area) afr__log(area) \\\n", "0 -0.490805 0.307835 \n", "1 0.615798 0.031907 \n", "... ... ... \n", "410773 1.431272 1.729715 \n", "410774 -0.916054 0.940624 \n", "\n", " afr__rooms*log(level) afr__kitchen_area*rooms \\\n", "0 0.690329 -0.132529 \n", "1 0.282625 -0.132529 \n", "... ... ... \n", "410773 -0.160491 1.581436 \n", "410774 1.217910 0.311575 \n", "\n", " afr__kitchen_area*levels afr__sqrt(geo_lon)*sqrt(level) \\\n", "0 -0.352834 0.323880 \n", "1 -0.418643 0.552794 \n", "... ... ... \n", "410773 2.432437 -0.843150 \n", "410774 -0.174762 -0.415359 \n", "\n", " afr__area**(3/2) afr__geo_lat*log(kitchen_area) \\\n", "0 -0.008748 -0.031529 \n", "1 -0.056540 -0.143829 \n", "... ... ... \n", "410773 0.411475 1.671069 \n", "410774 0.135617 0.359680 \n", "\n", " afr__geo_lat*log(geo_lon) \n", "0 0.068167 \n", "1 1.129118 \n", "... ... 
\n", "410773 -1.052343 \n", "410774 -0.246790 \n", "\n", "[410775 rows x 36 columns]" ] }, "metadata": {}, "output_type": "display_data" } ],
"source": [
"# Render X_train_afr with at most 5 rows but with every column visible\n",
"with pd.option_context('display.max_rows', 5, 'display.max_columns', None):\n",
"    display(X_train_afr)\n" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"pipeline_afr = Pipeline(\n",
"    steps=[\n",
"        ('preprocessor', preprocessor_afr),\n",
"        ('model', regressor),\n",
"    ]\n",
")\n",
"\n",
"pipeline_afr.fit(X_train, y_train)\n" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"predictions = pipeline_afr.predict(X_test)\n",
"\n",
"# Test-set metrics of the fitted pipeline\n",
"metrics = {\n",
"    \"mae\": mean_absolute_error(y_test, predictions),\n",
"    \"mape\": mean_absolute_percentage_error(y_test, predictions),\n",
"    \"mse\": mean_squared_error(y_test, predictions),\n",
"}\n",
"\n",
"metrics" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
"\n",
"with mlflow.start_run(run_name='autofeat', experiment_id=experiment_id) as run:\n",
"    # keep the run id: it is used below to re-read the run and check its status\n",
"    run_id = run.info.run_id\n",
"    mlflow.sklearn.log_model(\n",
"        pipeline_afr,\n",
"        artifact_path=\"models\",\n",
"        signature=signature,\n",
"        input_example=input_example,\n",
"        pip_requirements=req_file,\n",
"    )\n",
"    mlflow.log_metrics(metrics)\n",
"    mlflow.log_artifact(art)\n",
"    mlflow.log_params(pipeline_afr.get_params())\n",
"\n",
"run = mlflow.get_run(run_id)\n",
"assert run.info.status == 'FINISHED'" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# FEATURE SELECTION\n", "## RFE\n", "### Используем autofeat признаки\n", "Поскольку autofeat дает разные совокупности сгенерированных признаков, мы можем добавить выбор информативных только как шаг 
пайплайна " ] }, { "cell_type": "code", "execution_count": 294, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
num__geo_latnum__geo_lonnum__levelnum__levelsnum__roomsnum__areanum__kitchen_areacat__regioncat__building_typecat__object_type...afr__sqrt(area)*sqrt(kitchen_area)afr__sqrt(geo_lon)*sqrt(kitchen_area)afr__log(area)afr__rooms*log(level)afr__kitchen_area*roomsafr__kitchen_area*levelsafr__sqrt(geo_lon)*sqrt(level)afr__area**(3/2)afr__geo_lat*log(kitchen_area)afr__geo_lat*log(geo_lon)
00.495902-0.4497420.359235-0.2147890.2534130.063735-0.18628520.01.00.0...-0.153548-0.4908050.3078350.690329-0.132529-0.3528340.323880-0.008748-0.0315290.068167
10.1778061.433673-0.246529-0.3677180.253413-0.114293-0.18628570.01.00.0...-0.2672680.6157980.0319070.282625-0.132529-0.4186430.552794-0.056540-0.1438291.129118
20.4405480.047222-0.448450-0.367718-0.669085-0.456947-0.14254415.03.01.0...-0.454880-0.067183-0.603122-0.512211-0.487813-0.383803-0.243092-0.1408000.0634640.460495
3-1.588818-0.722477-0.246529-0.9794360.253413-0.181292-0.14254418.01.00.0...-0.254514-0.607607-0.0803040.282625-0.088119-0.662523-0.369355-0.073838-0.672113-1.481033
41.4936621.1258190.1573130.5498580.2534130.615045-0.01132210.02.00.0...0.4386000.8913831.0096120.5744970.0451120.2084780.9459810.1549020.7808551.923382
..................................................................
4107700.5920110.3550140.5611561.0086460.253413-0.079836-0.09265354.02.00.0...-0.1200350.2375800.0877250.792500-0.0374630.3227970.974381-0.0474960.2430180.789871
4107710.2404780.392697-0.650371-0.9794360.253413-0.334434-0.40498945.03.00.0...-0.716150-0.510766-0.357277-0.160491-0.354582-0.778657-0.406361-0.111897-0.8081570.574534
410772-1.936771-0.6888300.3592350.855717-0.669085-0.456947-0.14254418.00.01.0...-0.454880-0.581851-0.603122-0.211576-0.4878130.1736380.170166-0.140800-0.798234-1.663294
410773-0.748366-0.804077-0.6503710.7027880.2534131.3654411.50183352.03.00.0...2.8288901.4312721.729715-0.1604911.5814362.432437-0.8431500.4114751.671069-1.052343
4107741.257769-1.101815-0.0446080.0910701.1759110.553789-0.14254414.01.00.0...0.202136-0.9160540.9406241.2179100.311575-0.174762-0.4153590.1356170.359680-0.246790
\n", "

410775 rows × 36 columns

\n", "
" ], "text/plain": [ " num__geo_lat num__geo_lon num__level num__levels num__rooms \\\n", "0 0.495902 -0.449742 0.359235 -0.214789 0.253413 \n", "1 0.177806 1.433673 -0.246529 -0.367718 0.253413 \n", "2 0.440548 0.047222 -0.448450 -0.367718 -0.669085 \n", "3 -1.588818 -0.722477 -0.246529 -0.979436 0.253413 \n", "4 1.493662 1.125819 0.157313 0.549858 0.253413 \n", "... ... ... ... ... ... \n", "410770 0.592011 0.355014 0.561156 1.008646 0.253413 \n", "410771 0.240478 0.392697 -0.650371 -0.979436 0.253413 \n", "410772 -1.936771 -0.688830 0.359235 0.855717 -0.669085 \n", "410773 -0.748366 -0.804077 -0.650371 0.702788 0.253413 \n", "410774 1.257769 -1.101815 -0.044608 0.091070 1.175911 \n", "\n", " num__area num__kitchen_area cat__region cat__building_type \\\n", "0 0.063735 -0.186285 20.0 1.0 \n", "1 -0.114293 -0.186285 70.0 1.0 \n", "2 -0.456947 -0.142544 15.0 3.0 \n", "3 -0.181292 -0.142544 18.0 1.0 \n", "4 0.615045 -0.011322 10.0 2.0 \n", "... ... ... ... ... \n", "410770 -0.079836 -0.092653 54.0 2.0 \n", "410771 -0.334434 -0.404989 45.0 3.0 \n", "410772 -0.456947 -0.142544 18.0 0.0 \n", "410773 1.365441 1.501833 52.0 3.0 \n", "410774 0.553789 -0.142544 14.0 1.0 \n", "\n", " cat__object_type ... afr__sqrt(area)*sqrt(kitchen_area) \\\n", "0 0.0 ... -0.153548 \n", "1 0.0 ... -0.267268 \n", "2 1.0 ... -0.454880 \n", "3 0.0 ... -0.254514 \n", "4 0.0 ... 0.438600 \n", "... ... ... ... \n", "410770 0.0 ... -0.120035 \n", "410771 0.0 ... -0.716150 \n", "410772 1.0 ... -0.454880 \n", "410773 0.0 ... 2.828890 \n", "410774 0.0 ... 0.202136 \n", "\n", " afr__sqrt(geo_lon)*sqrt(kitchen_area) afr__log(area) \\\n", "0 -0.490805 0.307835 \n", "1 0.615798 0.031907 \n", "2 -0.067183 -0.603122 \n", "3 -0.607607 -0.080304 \n", "4 0.891383 1.009612 \n", "... ... ... 
\n", "410770 0.237580 0.087725 \n", "410771 -0.510766 -0.357277 \n", "410772 -0.581851 -0.603122 \n", "410773 1.431272 1.729715 \n", "410774 -0.916054 0.940624 \n", "\n", " afr__rooms*log(level) afr__kitchen_area*rooms \\\n", "0 0.690329 -0.132529 \n", "1 0.282625 -0.132529 \n", "2 -0.512211 -0.487813 \n", "3 0.282625 -0.088119 \n", "4 0.574497 0.045112 \n", "... ... ... \n", "410770 0.792500 -0.037463 \n", "410771 -0.160491 -0.354582 \n", "410772 -0.211576 -0.487813 \n", "410773 -0.160491 1.581436 \n", "410774 1.217910 0.311575 \n", "\n", " afr__kitchen_area*levels afr__sqrt(geo_lon)*sqrt(level) \\\n", "0 -0.352834 0.323880 \n", "1 -0.418643 0.552794 \n", "2 -0.383803 -0.243092 \n", "3 -0.662523 -0.369355 \n", "4 0.208478 0.945981 \n", "... ... ... \n", "410770 0.322797 0.974381 \n", "410771 -0.778657 -0.406361 \n", "410772 0.173638 0.170166 \n", "410773 2.432437 -0.843150 \n", "410774 -0.174762 -0.415359 \n", "\n", " afr__area**(3/2) afr__geo_lat*log(kitchen_area) \\\n", "0 -0.008748 -0.031529 \n", "1 -0.056540 -0.143829 \n", "2 -0.140800 0.063464 \n", "3 -0.073838 -0.672113 \n", "4 0.154902 0.780855 \n", "... ... ... \n", "410770 -0.047496 0.243018 \n", "410771 -0.111897 -0.808157 \n", "410772 -0.140800 -0.798234 \n", "410773 0.411475 1.671069 \n", "410774 0.135617 0.359680 \n", "\n", " afr__geo_lat*log(geo_lon) \n", "0 0.068167 \n", "1 1.129118 \n", "2 0.460495 \n", "3 -1.481033 \n", "4 1.923382 \n", "... ... 
\n", "410770 0.789871 \n", "410771 0.574534 \n", "410772 -1.663294 \n", "410773 -1.052343 \n", "410774 -0.246790 \n", "\n", "[410775 rows x 36 columns]" ] }, "execution_count": 294, "metadata": {}, "output_type": "execute_result" } ],
"source": [
"from sklearn.feature_selection import RFE\n",
"X_train_afr" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Recursive feature elimination down to 12 features; step=0.2 drops 20% of features each iteration\n",
"rfe_selector = RFE(\n",
"    estimator=regressor,\n",
"    n_features_to_select=12,\n",
"    step=0.2,\n",
")\n",
"X_train_rfe = rfe_selector.fit_transform(X_train_afr, y_train)" ] },
{ "cell_type": "code", "execution_count": 297, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
num__geo_latnum__geo_lonafr__geo_lonafr__area*kitchen_areaafr__sqrt(area)*geo_latafr__sqrt(area)*log(level)afr__kitchen_area*log(level)afr__sqrt(area)*sqrt(kitchen_area)afr__rooms*log(level)afr__kitchen_area*roomsafr__sqrt(geo_lon)*sqrt(level)afr__geo_lat*log(geo_lon)
00.495902-0.449742-0.449742-0.1321880.3731510.6880760.044178-0.1535480.690329-0.1325290.3238800.068167
10.1778061.4336731.433673-0.1693700.0051140.071369-0.173647-0.2672680.282625-0.1325290.5527941.129118
20.4405480.0472220.047222-0.226261-0.425530-0.335537-0.239271-0.454880-0.512211-0.487813-0.2430920.460495
3-1.588818-0.722477-0.722477-0.165302-0.7232250.034116-0.129771-0.2545140.282625-0.088119-0.369355-1.481033
41.4936621.1258191.1258190.0943421.5222650.8627730.1944900.4386000.5744970.0451120.9459811.923382
.......................................
4107700.5920110.3550140.355014-0.1208410.2069260.7144990.226990-0.1200350.792500-0.0374630.9743810.789871
4107710.2404780.3926970.392697-0.296252-0.297209-0.551021-0.560144-0.716150-0.160491-0.354582-0.4063610.574534
410772-1.936771-0.688830-0.688830-0.226261-1.1927060.3062800.100868-0.454880-0.211576-0.4878130.170166-1.663294
410773-0.748366-0.804077-0.8040771.5701631.274445-0.0025210.7455072.828890-0.1604911.581436-0.843150-1.052343
4107741.257769-1.101815-1.101815-0.0027421.3389960.635065-0.0403020.2021361.2179100.311575-0.415359-0.246790
\n", "

410775 rows × 12 columns

\n", "
" ], "text/plain": [ " num__geo_lat num__geo_lon afr__geo_lon afr__area*kitchen_area \\\n", "0 0.495902 -0.449742 -0.449742 -0.132188 \n", "1 0.177806 1.433673 1.433673 -0.169370 \n", "2 0.440548 0.047222 0.047222 -0.226261 \n", "3 -1.588818 -0.722477 -0.722477 -0.165302 \n", "4 1.493662 1.125819 1.125819 0.094342 \n", "... ... ... ... ... \n", "410770 0.592011 0.355014 0.355014 -0.120841 \n", "410771 0.240478 0.392697 0.392697 -0.296252 \n", "410772 -1.936771 -0.688830 -0.688830 -0.226261 \n", "410773 -0.748366 -0.804077 -0.804077 1.570163 \n", "410774 1.257769 -1.101815 -1.101815 -0.002742 \n", "\n", " afr__sqrt(area)*geo_lat afr__sqrt(area)*log(level) \\\n", "0 0.373151 0.688076 \n", "1 0.005114 0.071369 \n", "2 -0.425530 -0.335537 \n", "3 -0.723225 0.034116 \n", "4 1.522265 0.862773 \n", "... ... ... \n", "410770 0.206926 0.714499 \n", "410771 -0.297209 -0.551021 \n", "410772 -1.192706 0.306280 \n", "410773 1.274445 -0.002521 \n", "410774 1.338996 0.635065 \n", "\n", " afr__kitchen_area*log(level) afr__sqrt(area)*sqrt(kitchen_area) \\\n", "0 0.044178 -0.153548 \n", "1 -0.173647 -0.267268 \n", "2 -0.239271 -0.454880 \n", "3 -0.129771 -0.254514 \n", "4 0.194490 0.438600 \n", "... ... ... \n", "410770 0.226990 -0.120035 \n", "410771 -0.560144 -0.716150 \n", "410772 0.100868 -0.454880 \n", "410773 0.745507 2.828890 \n", "410774 -0.040302 0.202136 \n", "\n", " afr__rooms*log(level) afr__kitchen_area*rooms \\\n", "0 0.690329 -0.132529 \n", "1 0.282625 -0.132529 \n", "2 -0.512211 -0.487813 \n", "3 0.282625 -0.088119 \n", "4 0.574497 0.045112 \n", "... ... ... \n", "410770 0.792500 -0.037463 \n", "410771 -0.160491 -0.354582 \n", "410772 -0.211576 -0.487813 \n", "410773 -0.160491 1.581436 \n", "410774 1.217910 0.311575 \n", "\n", " afr__sqrt(geo_lon)*sqrt(level) afr__geo_lat*log(geo_lon) \n", "0 0.323880 0.068167 \n", "1 0.552794 1.129118 \n", "2 -0.243092 0.460495 \n", "3 -0.369355 -1.481033 \n", "4 0.945981 1.923382 \n", "... ... ... 
\n", "410770 0.974381 0.789871 \n", "410771 -0.406361 0.574534 \n", "410772 0.170166 -1.663294 \n", "410773 -0.843150 -1.052343 \n", "410774 -0.415359 -0.246790 \n", "\n", "[410775 rows x 12 columns]" ] }, "execution_count": 297, "metadata": {}, "output_type": "execute_result" } ],
"source": [
"X_train_afr_rfe = pd.DataFrame(X_train_rfe, columns=rfe_selector.get_feature_names_out())\n",
"X_train_afr_rfe" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# RFE is a pipeline step here, so the feature selection is redone on every fit of the pipeline\n",
"rfe_pipeline = Pipeline(steps=[\n",
"    ('preprocessor', preprocessor_afr),\n",
"    ('rfe_extractor', RFE(estimator=regressor, n_features_to_select=12, step=0.2)),\n",
"    ('model', regressor),\n",
"])\n",
"\n",
"rfe_pipeline.fit(X_train, y_train)" ] },
{ "cell_type": "code", "execution_count": 301, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'mae': 1431925.3203264712,\n", " 'mape': 1.239752923791043e+18,\n", " 'mse': 261947924998018.2}" ] }, "execution_count": 301, "metadata": {}, "output_type": "execute_result" } ], "source": [
"predictions_rfe = rfe_pipeline.predict(X_test)\n",
"\n",
"# NOTE(review): the recorded MAPE is ~1e18, which presumably means y_test contains\n",
"# zero / near-zero targets - TODO confirm and clean the target or drop MAPE\n",
"metrics = {}\n",
"metrics[\"mae\"] = mean_absolute_error(y_test, predictions_rfe)\n",
"metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions_rfe)\n",
"metrics[\"mse\"] = mean_squared_error(y_test, predictions_rfe)\n",
"\n",
"metrics" ] },
{ "cell_type": "code", "execution_count": 302, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 40.15it/s]\n", "2024/10/17 14:26:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run rfe_feature_selection at: http://127.0.0.1:5000/#/experiments/1/runs/96f0bbcd6d88466abcf38f3b53f06ff1.\n", "2024/10/17 14:26:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.\n" ] } ], "source": [
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
"RUN_NAME = 'rfe_feature_selection'\n",
"\n",
"with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
"    # keep the run id: it is used below to re-read the run and check its status\n",
"    run_id = run.info.run_id\n",
"    mlflow.sklearn.log_model(rfe_pipeline,\n",
"                             artifact_path=\"models\",\n",
"                             signature=signature,\n",
"                             input_example=input_example,\n",
"                             pip_requirements=req_file\n",
"                             )\n",
"    mlflow.log_metrics(metrics)\n",
"    mlflow.log_artifact(art)\n",
"    # log the parameters of the pipeline stored in this run, not those of another model\n",
"    mlflow.log_params(rfe_pipeline.get_params())\n",
"\n",
"run = mlflow.get_run(run_id)\n",
"assert (run.info.status == 'FINISHED')" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Используем sklearn признаки\n", "Тут мы можем отобрать признаки один раз на обучении, а далее в качестве шага пайплайна использовать написанный класс ColumnExtractor для выбора нужных столбцов" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"rfe_skl_selector = RFE(estimator=regressor, n_features_to_select=12, step=0.2)  # drop 20% of features each iteration\n",
"X_train_skl_rfe = rfe_skl_selector.fit_transform(X_train_sklearn, y_train)" ] },
{ "cell_type": "code", "execution_count": 305, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
num__geo_latnum__geo_lonnum__levelnum__roomsnum__kitchen_areacat__regionquantile__geo_latquantile__geo_lonquantile__levelpoly__area kitchen_areaspline__area_sp_0spline__area_sp_2
00.495902-0.4497420.3592350.253413-0.18628520.00.7662570.5110280.717217-0.1321880.1558060.178013
10.1778061.433673-0.2465290.253413-0.18628570.00.2971420.8679990.522022-0.1693700.1569210.176803
20.4405480.047222-0.448450-0.669085-0.14254415.00.7323300.6299840.417417-0.2262610.1590800.174488
3-1.588818-0.722477-0.2465290.253413-0.14254418.00.1487890.2952620.522022-0.1653020.1573410.176349
41.4936621.1258190.1573130.253413-0.01132210.00.9859370.7583630.6626630.0943420.1523900.181792
.......................................
4107700.5920110.3550140.5611560.253413-0.09265354.00.7883930.6867280.771271-0.1208410.1567050.177037
4107710.2404780.392697-0.6503710.253413-0.40498945.00.4940620.7172400.309810-0.2962520.1583060.175314
410772-1.936771-0.6888300.359235-0.669085-0.14254418.00.1313520.3276130.717217-0.2262610.1590800.174488
410773-0.748366-0.804077-0.6503710.2534131.50183352.00.1931430.1147530.3098101.5701630.1478200.187011
4107741.257769-1.101815-0.0446081.175911-0.14254414.00.9080360.0757250.604605-0.0027420.1527670.181370
\n", "

410775 rows × 12 columns

\n", "
" ], "text/plain": [ " num__geo_lat num__geo_lon num__level num__rooms num__kitchen_area \\\n", "0 0.495902 -0.449742 0.359235 0.253413 -0.186285 \n", "1 0.177806 1.433673 -0.246529 0.253413 -0.186285 \n", "2 0.440548 0.047222 -0.448450 -0.669085 -0.142544 \n", "3 -1.588818 -0.722477 -0.246529 0.253413 -0.142544 \n", "4 1.493662 1.125819 0.157313 0.253413 -0.011322 \n", "... ... ... ... ... ... \n", "410770 0.592011 0.355014 0.561156 0.253413 -0.092653 \n", "410771 0.240478 0.392697 -0.650371 0.253413 -0.404989 \n", "410772 -1.936771 -0.688830 0.359235 -0.669085 -0.142544 \n", "410773 -0.748366 -0.804077 -0.650371 0.253413 1.501833 \n", "410774 1.257769 -1.101815 -0.044608 1.175911 -0.142544 \n", "\n", " cat__region quantile__geo_lat quantile__geo_lon quantile__level \\\n", "0 20.0 0.766257 0.511028 0.717217 \n", "1 70.0 0.297142 0.867999 0.522022 \n", "2 15.0 0.732330 0.629984 0.417417 \n", "3 18.0 0.148789 0.295262 0.522022 \n", "4 10.0 0.985937 0.758363 0.662663 \n", "... ... ... ... ... \n", "410770 54.0 0.788393 0.686728 0.771271 \n", "410771 45.0 0.494062 0.717240 0.309810 \n", "410772 18.0 0.131352 0.327613 0.717217 \n", "410773 52.0 0.193143 0.114753 0.309810 \n", "410774 14.0 0.908036 0.075725 0.604605 \n", "\n", " poly__area kitchen_area spline__area_sp_0 spline__area_sp_2 \n", "0 -0.132188 0.155806 0.178013 \n", "1 -0.169370 0.156921 0.176803 \n", "2 -0.226261 0.159080 0.174488 \n", "3 -0.165302 0.157341 0.176349 \n", "4 0.094342 0.152390 0.181792 \n", "... ... ... ... 
\n", "410770 -0.120841 0.156705 0.177037 \n", "410771 -0.296252 0.158306 0.175314 \n", "410772 -0.226261 0.159080 0.174488 \n", "410773 1.570163 0.147820 0.187011 \n", "410774 -0.002742 0.152767 0.181370 \n", "\n", "[410775 rows x 12 columns]" ] }, "execution_count": 305, "metadata": {}, "output_type": "execute_result" } ],
"source": [
"X_train_skl_rfe = pd.DataFrame(X_train_skl_rfe, columns=rfe_skl_selector.get_feature_names_out())\n",
"X_train_skl_rfe" ] },
{ "cell_type": "code", "execution_count": 306, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['num__geo_lat',\n", " 'num__geo_lon',\n", " 'num__level',\n", " 'num__rooms',\n", " 'num__kitchen_area',\n", " 'cat__region',\n", " 'quantile__geo_lat',\n", " 'quantile__geo_lon',\n", " 'quantile__level',\n", " 'poly__area kitchen_area',\n", " 'spline__area_sp_0',\n", " 'spline__area_sp_2']" ] }, "execution_count": 306, "metadata": {}, "output_type": "execute_result" } ], "source": [
"rfe_cols = X_train_skl_rfe.columns.tolist()\n",
"rfe_cols" ] },
{ "cell_type": "code", "execution_count": 307, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([ True, True, True, False, True, False, True, True, False,\n", " False, True, True, True, False, False, False, False, False,\n", " False, False, False, True, False, True, False, True, False,\n", " False])" ] }, "execution_count": 307, "metadata": {}, "output_type": "execute_result" } ], "source": [
"rfe_idx = rfe_skl_selector.support_\n",
"rfe_idx" ] },
{ "cell_type": "code", "execution_count": 316, "metadata": {}, "outputs": [], "source": [
"# The selected columns must be logged, otherwise the information about which features were chosen is lost.\n",
"# The mask is written as a plain list: str() of a NumPy array is not parseable and elides long arrays.\n",
"with open('rfe_skl_idx.txt', 'w') as f:\n",
"    f.write(str(rfe_idx.tolist()))\n",
"with open('rfe_skl_cols.txt', 'w') as f:\n",
"    f.write(str(rfe_cols))" ] },
{ "cell_type": "code", "execution_count": 309, "metadata": {}, "outputs": [], "source": [
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"\n",
"\n",
"class ColumnExtractor(BaseEstimator, TransformerMixin):\n",
"    \"\"\"Pipeline step that keeps only a fixed, pre-selected set of columns.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    cols : boolean mask or sequence of int\n",
"        Positional selector applied to the second axis of the input.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, cols):\n",
"        self.cols = cols\n",
"\n",
"    def fit(self, X, y=None):\n",
"        \"\"\"Do nothing and return self: the selection is fixed at construction time.\"\"\"\n",
"        return self\n",
"\n",
"    def transform(self, X):\n",
"        \"\"\"Return the columns of X picked by ``self.cols``.\"\"\"\n",
"        # DataFrames need positional .iloc; arrays accept the selector directly\n",
"        if hasattr(X, 'iloc'):\n",
"            return X.iloc[:, self.cols]\n",
"        return X[:, self.cols]\n" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"rfe_skl_pipeline = Pipeline(steps=[\n",
"    ('preprocessor', preprocessor_sklearn),\n",
"    ('rfe_extractor', ColumnExtractor(rfe_idx)),\n",
"    ('model', regressor)\n",
"])\n",
"\n",
"rfe_skl_pipeline.fit(X_train, y_train)" ] },
{ "cell_type": "code", "execution_count": 311, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 193.34it/s]\n", "2024/10/17 14:32:07 INFO mlflow.tracking._tracking_service.client: 🏃 View run rfe_skl_feature_selection at: http://127.0.0.1:5000/#/experiments/1/runs/e55206caeb1549e4aa0d98343d5c1d4d.\n", "2024/10/17 14:32:07 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1.\n" ] } ], "source": [
"predictions_rfe_skl = rfe_skl_pipeline.predict(X_test)\n",
"\n",
"metrics = {}\n",
"metrics[\"mae\"] = mean_absolute_error(y_test, predictions_rfe_skl)\n",
"metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions_rfe_skl)\n",
"metrics[\"mse\"] = mean_squared_error(y_test, predictions_rfe_skl)\n",
"\n",
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
"RUN_NAME = 'rfe_skl_feature_selection'\n",
"\n",
"with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
"    # keep the run id: it is used below to re-read the run and check its status\n",
"    run_id = run.info.run_id\n",
"    # log the pipeline the metrics above were computed for\n",
"    mlflow.sklearn.log_model(rfe_skl_pipeline,\n",
"                             artifact_path=\"models\",\n",
"                             signature=signature,\n",
"                             input_example=input_example,\n",
"                             pip_requirements=req_file\n",
"                             )\n",
"    mlflow.log_metrics(metrics)\n",
"    mlflow.log_artifact('rfe_skl_cols.txt')\n",
"    mlflow.log_artifact('rfe_skl_idx.txt')\n",
"    mlflow.log_params(rfe_skl_pipeline.get_params())\n",
"\n",
"run = mlflow.get_run(run_id)\n",
"assert (run.info.status == 'FINISHED')" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## mlxtend\n", "https://github.com/rasbt/mlxtend/blob/master/docs/sources/user_guide/feature_selection/SequentialFeatureSelector.ipynb " ] },
{ "cell_type": "code", "execution_count": 312, "metadata": {}, "outputs": [], "source": [
"from mlxtend.feature_selection import SequentialFeatureSelector\n",
"# alternative implementation: from sklearn.feature_selection import SequentialFeatureSelector" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# random_state is fixed so that repeated runs select the same features\n",
"sfs = SequentialFeatureSelector(RandomForestRegressor(n_estimators=3, random_state=2),\n",
"                                k_features=3,\n",
"                                forward=True,\n",
"                                floating=False,  # True to drop selected features\n",
"                                scoring='neg_mean_absolute_error',\n",
"                                cv=2)\n",
"\n",
"sfs.fit(X_train_sklearn, y_train)" ] },
{ "cell_type": "code", "execution_count": 314, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
num__geo_lonquantile__geo_latspline__area_sp_3
0-0.4497420.7662571.826008e-06
11.4336730.2971421.310449e-06
20.0472220.7323306.098363e-07
3-0.7224770.1487891.144942e-06
41.1258190.9859374.240047e-06
............
4107700.3550140.7883931.401454e-06
4107710.3926970.4940628.202272e-07
410772-0.6888300.1313526.098363e-07
410773-0.8040770.1931431.004843e-05
410774-1.1018150.9080363.903343e-06
\n", "

410775 rows × 3 columns

\n", "
" ], "text/plain": [ " num__geo_lon quantile__geo_lat spline__area_sp_3\n", "0 -0.449742 0.766257 1.826008e-06\n", "1 1.433673 0.297142 1.310449e-06\n", "2 0.047222 0.732330 6.098363e-07\n", "3 -0.722477 0.148789 1.144942e-06\n", "4 1.125819 0.985937 4.240047e-06\n", "... ... ... ...\n", "410770 0.355014 0.788393 1.401454e-06\n", "410771 0.392697 0.494062 8.202272e-07\n", "410772 -0.688830 0.131352 6.098363e-07\n", "410773 -0.804077 0.193143 1.004843e-05\n", "410774 -1.101815 0.908036 3.903343e-06\n", "\n", "[410775 rows x 3 columns]" ] }, "execution_count": 314, "metadata": {}, "output_type": "execute_result" } ], "source": [ "selected_features_sfs = X_train_sklearn.loc[:, sfs.k_feature_names_]\n", "selected_features_sfs" ] }, { "cell_type": "code", "execution_count": 315, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['num__geo_lon', 'quantile__geo_lat', 'spline__area_sp_3']" ] }, "execution_count": 315, "metadata": {}, "output_type": "execute_result" } ], "source": [ "rfe_sfs_idx = list(sfs.k_feature_idx_)\n", "rfe_sfs_idx\n", "rfe_sfs_col = list(sfs.k_feature_names_)\n", "rfe_sfs_col" ] }, { "cell_type": "code", "execution_count": 317, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import matplotlib.pyplot as plt\n", "from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs\n", "\n", "fig = plot_sfs(sfs.get_metric_dict(), kind='std_dev')\n", "\n", "plt.title('Sequential Forward Selection (w. StdDev)')\n", "plt.grid()\n", "plt.show()\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "rfe_sfs_pipeline = Pipeline(steps=[\n", " ('preprocessor', preprocessor_sklearn), \n", " ('rfe_extractor', ColumnExtractor(rfe_sfs_idx)),\n", " ('model', regressor)\n", "])\n", "\n", "rfe_sfs_pipeline.fit(X_train, y_train)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "predictions_sfs = rfe_sfs_pipeline.predict(X_test)\n", "\n", "metrics = {}\n", "metrics[\"mae\"] = mean_absolute_error(y_test, predictions_sfs) \n", "metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions_sfs)\n", "metrics[\"mse\"] = mean_squared_error(y_test, predictions_sfs)\n", "\n", "metrics\n", "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n", "RUN_NAME = 'rfe_sfs_feature_selection'\n", "\n", "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n", " # получаем уникальный идентификатор запуска эксперимента\n", " run_id = run.info.run_id \n", " mlflow.sklearn.log_model(rfe_sfs_pipeline, \n", " artifact_path=\"models\",\n", " signature=signature,\n", " input_example=input_example,\n", " pip_requirements=req_file\n", " )\n", " mlflow.log_metrics(metrics)\n", " mlflow.log_artifact('rfe_skl_cols.txt')\n", " mlflow.log_artifact('rfe_skl_idx.txt')\n", " mlflow.log_params(model_sklearn.get_params())\n", "\n", "run = mlflow.get_run(run_id) \n", "assert (run.info.status =='FINISHED')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "Можно совмещать признаки, выбранные по sfs и sbs: брать их объединение или пересечение. 
Можно комбинировать с признаками, выделенными разными подходами - целое поле для исследований" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# HYPERPARAMS\n", "## Gridsearch" ] }, { "cell_type": "code", "execution_count": 224, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import GridSearchCV" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Candidate values for the 'depth' parameter of the pipeline's 'model' step\n", "param_grid = {\n", "    'model__depth': [1, 3, 5]\n", "}" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# 2-fold grid search over the SFS pipeline, scored by (negated) MAE\n", "gs = GridSearchCV(rfe_sfs_pipeline, param_grid, cv=2, scoring='neg_mean_absolute_error')\n", "gs.fit(X_train, y_train)\n", "print(\"Лучшие гиперпараметры:\", gs.best_params_)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Rebuild the pipeline with the depth selected by the grid search instead of a hard-coded value\n", "gs_pipeline = Pipeline(steps=[\n", "    ('preprocessor', preprocessor_sklearn),\n", "    ('rfe_extractor', ColumnExtractor(rfe_sfs_idx)),\n", "    ('model', CatBoostRegressor(depth=gs.best_params_['model__depth']))\n", "])\n", "\n", "# Next: run the standard evaluation on the test set and log the run" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Вместо GridSearch можно использовать RandomSearch" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Optuna" ] }, { "cell_type": "code", "execution_count": 292, "metadata": {}, "outputs": [], "source": [ "import optuna" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def objective(trial):\n", " \"\"\"Optuna objective: train the pipeline with trial-suggested depth and learning_rate and return its MAE.\"\"\"\n", " # suggest hyperparameters for this trial\n", " depth = trial.suggest_int('depth', 1, 10)\n", " learning_rate = trial.suggest_float('learning_rate', 0.001, 0.1)\n", "\n", " # build the pipeline that will be trained with the suggested values\n", " opt_pipeline = Pipeline(steps=[\n", " ('preprocessor', preprocessor_sklearn), \n", " ('rfe_extractor', ColumnExtractor(rfe_sfs_idx)),\n", " ('model', CatBoostRegressor(depth=depth, learning_rate=learning_rate, verbose=0))\n", " ])\n", "\n", " 
# score trials on a validation split carved out of train, so the test set is never used for tuning\n", " X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=2)\n", " opt_pipeline.fit(X_fit, y_fit)\n", "\n", " # predict on the validation part and compute MAE\n", " preds = opt_pipeline.predict(X_val)\n", " mae = mean_absolute_error(y_val, preds)\n", "\n", " return mae" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# seeded sampler so the sequence of suggested hyperparameters is reproducible\n", "study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=2))\n", "study.optimize(objective, n_trials=10)\n", "\n", "# report the results\n", "print('Number of finished trials:', len(study.trials))\n", "print('Best trial:', study.best_trial.params)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Rebuild the pipeline with the best hyperparameters found by the study instead of hard-coded values\n", "opt_pipeline = Pipeline(steps=[\n", "    ('preprocessor', preprocessor_sklearn),\n", "    ('rfe_extractor', ColumnExtractor(rfe_sfs_idx)),\n", "    ('model', CatBoostRegressor(**study.best_params))\n", "])\n", "\n", "# Next: run the standard evaluation on the test set and log the run" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Выбираем лучшую модель.\n", "Обучаем ее на всей выборке (а не только на train-части). \n", "Далее будем деплоить именно её" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] } ], "metadata": { "kernelspec": { "display_name": ".venv_labs_proj", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }