Вы не можете выбрать более 25 тем
Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.
1016 строки
31 KiB
Plaintext
1016 строки
31 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import os\n",
|
|
"import mlflow\n",
|
|
"\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"import pandas as pd\n",
|
|
"import numpy\n",
|
|
"\n",
|
|
"from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder\n",
|
|
"\n",
|
|
"from sklearn.pipeline import Pipeline\n",
|
|
"from sklearn.compose import ColumnTransformer\n",
|
|
"from sklearn.ensemble import RandomForestRegressor\n",
|
|
"from sklearn.linear_model import LinearRegression\n",
|
|
"from catboost import CatBoostRegressor\n",
|
|
"\n",
|
|
"from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df = pd.read_pickle('data/clean_data.pkl').sample(frac=0.1, random_state = 2) # Уменьшаем размер чтобы модель быстрее обучалась на лекции\n",
|
|
"df.info()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df = df.rename(columns={'price': 'target'})\n",
|
|
"df = df.drop(columns=['date', 'time'])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"df"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.25, random_state=2)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"cat_features = X_train.select_dtypes(include=['category','object']).columns.to_list()\n",
|
|
"cat_features"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"num_features = X_train.select_dtypes(include=['number']).columns.to_list()\n",
|
|
"num_features"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"https://scikit-learn.org/stable/api/sklearn.preprocessing.html - разные способы кодирования и скалирования"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 8,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"s_scaler = StandardScaler()\n",
|
|
"l_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=99999999) # unknown_value нужно выбирать с умом\n",
|
|
"regressor = CatBoostRegressor()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Column transformer"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 9,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Для удобной работы со столбцами\n",
|
|
"preprocessor = ColumnTransformer(\n",
|
|
" transformers=[\n",
|
|
" ('num', s_scaler, num_features), # преобразования для числовых признаков\n",
|
|
" ('cat', l_encoder, cat_features), # преобразования для категориальных признаков\n",
|
|
" ],\n",
|
|
" remainder='drop' ) # Удаляем столбцы, которые не затронуты преобразования"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"pipeline = Pipeline(steps=[('preprocessor', preprocessor), \n",
|
|
" ('model', regressor)])\n",
|
|
"\n",
|
|
"pipeline.fit(X_train, y_train)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"predictions = pipeline.predict(X_test) \n",
|
|
"\n",
|
|
"metrics = {}\n",
|
|
"metrics[\"mae\"] = mean_absolute_error(y_test, predictions) \n",
|
|
"metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions)\n",
|
|
"metrics[\"mse\"] = mean_squared_error(y_test, predictions)\n",
|
|
"\n",
|
|
"metrics"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"# Работаем с MLflow локально\n",
|
|
"TRACKING_SERVER_HOST = \"127.0.0.1\"\n",
|
|
"TRACKING_SERVER_PORT = 5000\n",
|
|
"\n",
|
|
"registry_uri = f\"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}\"\n",
|
|
"tracking_uri = f\"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}\"\n",
|
|
"\n",
|
|
"mlflow.set_tracking_uri(tracking_uri) \n",
|
|
"mlflow.set_registry_uri(registry_uri) \n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 13,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# название тестового эксперимента, запуска (run) внутри него, имени, под которым модель будет регистрироваться\n",
|
|
"EXPERIMENT_NAME = \"estate_project\"\n",
|
|
"RUN_NAME = \"baseline model\"\n",
|
|
"REGISTRY_MODEL_NAME = \"estate_model_rf\""
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Логируем вручную"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Обязательно логируем сигнатуру модели и пример входных данных. Подготовим их\n",
|
|
"from mlflow.models import infer_signature\n",
|
|
"\n",
|
|
"signature = infer_signature(model_input = X_train.head(5))\n",
|
|
"input_example = X_train.head(5)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 15,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Будем логировать requirements и артефакт - текстовый файл\n",
|
|
"req_file = 'requirements.txt'\n",
|
|
"art = 'comment.txt'"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 16,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Параметры, котороые будут залогированы, можем задавать вручную или полностью взять из модели\n",
|
|
"#params_dict = {'n_estimators': 10, 'max_depth': 10}\n",
|
|
"params_dict = pipeline.get_params()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Когда создаем новый эксперимент, то: \n",
|
|
"experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)\n",
|
|
"\n",
|
|
"# Впоследствии. чтобы добавлять запуски в этот же эксепримент мы должны получить его id:\n",
|
|
"#experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
|
|
"\n",
|
|
"with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
|
|
" # получаем уникальный идентификатор запуска эксперимента\n",
|
|
" run_id = run.info.run_id \n",
|
|
" mlflow.sklearn.log_model(pipeline, \n",
|
|
" artifact_path=\"models\",\n",
|
|
" signature=signature,\n",
|
|
" input_example=input_example,\n",
|
|
" pip_requirements=req_file\n",
|
|
" )\n",
|
|
" mlflow.log_metrics(metrics)\n",
|
|
" mlflow.log_artifact(art)\n",
|
|
" mlflow.log_params(params_dict)\n",
|
|
"\n",
|
|
"run = mlflow.get_run(run_id) \n",
|
|
"assert (run.info.status =='FINISHED')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Удаление runs, experiments\n",
|
|
"\n",
|
|
"Использовать осторожно"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
|
|
"#mlflow.delete_experiment(experiment_id)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"mlflow.search_runs(\n",
|
|
" #experiment_ids=[experiment_id],\n",
|
|
" experiment_names=[EXPERIMENT_NAME],\n",
|
|
" # filter_string='status = \"FAILED\"'\n",
|
|
" #filter_string='metrics.mae > 1'\n",
|
|
" \n",
|
|
")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"#mlflow.delete_run('74d2a7a40c07413c9cf65df841164356')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 26,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Автологирование\n",
|
|
"После включения будет срабатывать на каждом обучении модели (на методе fit()).\n",
|
|
"\n",
|
|
"Есть плюсы, есть и минусы. Предлагается сделать прогон и сравнить с результатами вручную "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"mlflow.sklearn.autolog()\n",
|
|
"\n",
|
|
"with mlflow.start_run(run_name='auto', experiment_id=experiment_id) as run:\n",
|
|
" pipeline.fit(X_train, y_train)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 22,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Отключаем автологирование\n",
|
|
"mlflow.sklearn.autolog(disable=True)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Model #2\n",
|
|
"Обучим вторую \"маленькую\" модель\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"regressor2 = RandomForestRegressor(n_estimators=10, max_depth=6)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"pipeline = Pipeline(steps=[('preprocessor', preprocessor), \n",
|
|
" ('model', regressor2)])\n",
|
|
"\n",
|
|
"pipeline.fit(X_train, y_train)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"predictions = pipeline.predict(X_test) \n",
|
|
"metrics = {}\n",
|
|
"metrics[\"mae\"] = mean_absolute_error(y_test, predictions) \n",
|
|
"metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions)\n",
|
|
"metrics[\"mse\"] = mean_squared_error(y_test, predictions)\n",
|
|
"\n",
|
|
"metrics"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# !!! Проверить название прогона а также все логируемые параметры и артефакты, что они соответствуют второй \"маленькой\" модели. \n",
|
|
"\n",
|
|
"\n",
|
|
"RUN_NAME = 'smaller_model'\n",
|
|
"\n",
|
|
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
|
|
"\n",
|
|
"with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
|
|
" # получаем уникальный идентификатор запуска эксперимента\n",
|
|
" run_id = run.info.run_id \n",
|
|
" mlflow.sklearn.log_model(pipeline, \n",
|
|
" artifact_path=\"models\",\n",
|
|
" signature=signature,\n",
|
|
" input_example=input_example,\n",
|
|
" pip_requirements=req_file\n",
|
|
" )\n",
|
|
" mlflow.log_metrics(metrics)\n",
|
|
" mlflow.log_artifact(art)\n",
|
|
" mlflow.log_params(pipeline.get_params())\n",
|
|
"\n",
|
|
"run = mlflow.get_run(run_id) \n",
|
|
"assert (run.info.status =='FINISHED')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# No model\n",
|
|
"# Логировать можно только артефакты, без модели. Например, залогироавть графики после этапа EDA\n",
|
|
"\n",
|
|
"RUN_NAME = 'no_model'\n",
|
|
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
|
|
"\n",
|
|
"with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
|
|
" run_id = run.info.run_id \n",
|
|
" mlflow.log_artifact(art)\n",
|
|
"\n",
|
|
"\n",
|
|
"run = mlflow.get_run(run_id) \n",
|
|
"assert (run.info.status =='FINISHED')\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"run_id = '06fa7ec1f1b74aedb3509c88dc4ee1c0' # Указываем run id\n",
|
|
"mlflow.register_model(f\"runs:/{run_id}/models\", REGISTRY_MODEL_NAME)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Можно регистрировать сразу при создании прогона\n",
|
|
"\n",
|
|
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
|
|
"\n",
|
|
"with mlflow.start_run(run_name='register_at_run', experiment_id=experiment_id) as run:\n",
|
|
" # получаем уникальный идентификатор запуска эксперимента\n",
|
|
" run_id = run.info.run_id \n",
|
|
" mlflow.sklearn.log_model(pipeline, \n",
|
|
" artifact_path=\"models\",\n",
|
|
" signature=signature,\n",
|
|
" input_example=input_example,\n",
|
|
" pip_requirements=req_file,\n",
|
|
" registered_model_name = REGISTRY_MODEL_NAME # Указываем для какой модели регистрируем\n",
|
|
" )\n",
|
|
" mlflow.log_metrics(metrics)\n",
|
|
" mlflow.log_artifact(art)\n",
|
|
" mlflow.log_params(pipeline.get_params())\n",
|
|
"\n",
|
|
"run = mlflow.get_run(run_id) \n",
|
|
"assert (run.info.status =='FINISHED')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Можно найти зарегистрированные модели\n",
|
|
"model_reg = mlflow.search_registered_models()\n",
|
|
"model_reg[0]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 32,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"model_name = REGISTRY_MODEL_NAME\n",
|
|
"model_version = 1\n",
|
|
"\n",
|
|
"model_loaded = mlflow.sklearn.load_model(model_uri=f\"models:/{model_name}/{model_version}\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"model_loaded.predict(X_test.iloc[0:1])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"y_test.iloc[0]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Feature engineering"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Sklearn"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 35,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from sklearn.preprocessing import QuantileTransformer, SplineTransformer, PolynomialFeatures, MinMaxScaler"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 36,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"X_train_sklearn = X_train.copy()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### PolynomialFeatures\n",
|
|
"Создает полином степени `degree` из указанных признаков\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 37,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"pf = PolynomialFeatures(degree=2)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"X_train_sklearn"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"pf.fit_transform(X_train_sklearn[['area','kitchen_area']])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### SplineTransformer\n",
|
|
"Cоздаёт новую матрицу признаков, состоящую из сплайнов порядка degree. Количество сгенерированных сплайнов равно `n_splines=n_knots + degree - 1` для каждого признака, где\n",
|
|
"\n",
|
|
"`n_knots` определяет количество узлов (точек, в которых сопрягаются сплайны) для каждого признака. \n",
|
|
"\n",
|
|
"`degree` определяет порядок полинома, используемого для построения сплайнов. "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 43,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"sp = SplineTransformer(n_knots=3, degree=3)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"sp.fit_transform(X_train_sklearn[['area']])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"#### QuantileTransformer\n",
|
|
"Этот метод преобразует признаки, чтобы они распределялись равномерно или нормально — так данные меньше подвергаются влиянию выбросов. Преобразование применяется к каждому признаку независимо. Идея метода такова: оценить функцию распределения признака, чтобы преобразовать исходные значения в равномерное или нормальное распределение. \n",
|
|
"\n",
|
|
"`output_distribution='uniform'` или\n",
|
|
"`output_distribution='normal'` соответственно\n",
|
|
"\n",
|
|
"\n",
|
|
"Пример использования: если у вас есть данные о доходах с широким диапазоном значений, квантильное преобразование сделает их более сопоставимыми и устойчивыми к выбросам."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 47,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"qt = QuantileTransformer()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"qt.fit_transform(X_train_sklearn[['area']])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Объединяем в ColumnTransformer и создаем Pipeline "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 49,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"pf = PolynomialFeatures(degree=2)\n",
|
|
"qt = QuantileTransformer()\n",
|
|
"sp = SplineTransformer(n_knots=3, degree=3)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 50,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Значения преобразованных признаков нужно отскейлить, поэтому создаем pipeline из двух шагов - преобразование и скейлинг\n",
|
|
"pf_pipeline = Pipeline(steps=[\n",
|
|
" ('poly', pf),\n",
|
|
" ('scale', StandardScaler())\n",
|
|
"])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 51,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"preprocessor_sklearn = ColumnTransformer(\n",
|
|
" transformers=[\n",
|
|
" ('num', s_scaler, num_features), # преобразования для числовых признаков\n",
|
|
" ('cat', l_encoder, cat_features), # преобразования для категориальных признаков\n",
|
|
" ('quantile', qt,num_features),\n",
|
|
" ('poly', pf_pipeline, ['area', 'kitchen_area']), # В преобразования добавляем созданный ранее pipeline\n",
|
|
" ('spline', sp, ['area'])\n",
|
|
" ],\n",
|
|
" remainder='drop',\n",
|
|
" ) # Удаляем столбцы, которые не затронуты преобразования"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Посмотрим что из себя теперь представляет датафрейм"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 56,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"## не влезаем в float64 в полиномальном преобразовании. Использовать его нужно с умом!\n",
|
|
"X_train_sklearn[['area', 'kitchen_area']] = X_train_sklearn[['area', 'kitchen_area']].astype('float128')\n",
|
|
"X_train_sklearn[['area', 'kitchen_area']] = X_train_sklearn[['area', 'kitchen_area']].astype('float128')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 58,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"X_train_sklearn_raw = preprocessor_sklearn.fit_transform(X_train_sklearn)\n",
|
|
"X_train_sklearn = pd.DataFrame(X_train_sklearn_raw, columns=preprocessor_sklearn.get_feature_names_out())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Удобно использовать для отображения всех строк\\столбцов в DataFrame\n",
|
|
"with pd.option_context('display.max_rows', 5, 'display.max_columns', None):\n",
|
|
" display (X_train_sklearn)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Создаем пайплайн с препроцессингом и моделью"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"pipeline_sklearn = Pipeline(steps=[\n",
|
|
" ('transform', preprocessor_sklearn),\n",
|
|
" ('model', regressor)\n",
|
|
"])\n",
|
|
"\n",
|
|
"model_sklearn = pipeline_sklearn.fit(X_train, y_train)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"model_sklearn"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"predictions = model_sklearn.predict(X_test) \n",
|
|
"metrics = {}\n",
|
|
"metrics[\"mae\"] = mean_absolute_error(y_test, predictions) \n",
|
|
"metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions)\n",
|
|
"metrics[\"mse\"] = mean_squared_error(y_test, predictions)\n",
|
|
"\n",
|
|
"metrics"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
|
|
"RUN_NAME = 'fe_sklearn'\n",
|
|
"\n",
|
|
"with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
|
|
" # получаем уникальный идентификатор запуска эксперимента\n",
|
|
" run_id = run.info.run_id \n",
|
|
" mlflow.sklearn.log_model(model_sklearn, \n",
|
|
" artifact_path=\"models\",\n",
|
|
" signature=signature,\n",
|
|
" input_example=input_example,\n",
|
|
" pip_requirements=req_file\n",
|
|
" )\n",
|
|
" mlflow.log_metrics(metrics)\n",
|
|
" mlflow.log_artifact(art)\n",
|
|
" mlflow.log_params(model_sklearn.get_params())\n",
|
|
"\n",
|
|
"run = mlflow.get_run(run_id) \n",
|
|
"assert (run.info.status =='FINISHED')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Autofeat"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 65,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from autofeat import AutoFeatRegressor\n",
|
|
"transformations = [\"1/\", \"exp\", \"log\", \"abs\", \"sqrt\", \"^2\", \"^3\", \"1+\", \"1-\", \"sin\", \"cos\", \"exp-\", \"2^\"] "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"afreg = AutoFeatRegressor(verbose=1, feateng_steps=2, max_gb=8, transformations=[\"log\", \"sqrt\"],feateng_cols=num_features)\n",
|
|
"X_train_arf = afreg.fit_transform(X_train,y_train)\n",
|
|
"X_train_arf"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 67,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Создаем обертку, в которой добавляем метод get_feature_names_out() для получения названий признаков\n",
|
|
"import numpy as np\n",
|
|
"\n",
|
|
"class AutoFeatWrapper():\n",
|
|
" def __init__(self, feateng_cols, feateng_steps=1, max_gb=16, transformations=[\"1/\", \"exp\", \"log\"], n_jobs=-1, verbose=1):\n",
|
|
" self.feateng_cols = feateng_cols\n",
|
|
" self.feateng_steps = feateng_steps\n",
|
|
" self.max_gb = max_gb\n",
|
|
" self.transformations = transformations\n",
|
|
" self.n_jobs = n_jobs\n",
|
|
" self.afreg = AutoFeatRegressor(feateng_cols=self.feateng_cols,\n",
|
|
" feateng_steps=self.feateng_steps,\n",
|
|
" max_gb=self.max_gb,\n",
|
|
" transformations=self.transformations,\n",
|
|
" n_jobs=self.n_jobs)\n",
|
|
" \n",
|
|
" def fit(self, X, y=None):\n",
|
|
" self.afreg.fit(X, y)\n",
|
|
" return self\n",
|
|
" \n",
|
|
" def transform(self, X):\n",
|
|
" return self.afreg.transform(X)\n",
|
|
" \n",
|
|
" def get_feature_names_out(self, input_features=None):\n",
|
|
" # Преобразуем данные и возвращаем имена фичей из DataFrame\n",
|
|
" transformed_X = self.afreg.transform(pd.DataFrame(np.zeros((1, len(self.feateng_cols))), columns=self.feateng_cols))\n",
|
|
" return transformed_X.columns.tolist()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 69,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"afreg_pipeline = Pipeline(steps=[\n",
|
|
" ('autofeat', AutoFeatWrapper( feateng_steps=2, max_gb=16, transformations=[\"log\", \"sqrt\"],feateng_cols=num_features)),\n",
|
|
" ('scaler', StandardScaler()),\n",
|
|
"])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 70,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"preprocessor_afr = ColumnTransformer(\n",
|
|
" transformers=[\n",
|
|
" ('num', s_scaler, num_features), # преобразования для числовых признаков\n",
|
|
" ('cat', l_encoder, cat_features), # преобразования для категориальных признаков\n",
|
|
" ('afr', afreg_pipeline, num_features), # преобразования autofeat\n",
|
|
" ],\n",
|
|
" remainder='drop', # Удаляем столбцы, которые не затронуты преобразованиями\n",
|
|
" ) "
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"X_train_afr_raw = preprocessor_afr.fit_transform(X_train,y_train)\n",
|
|
"X_train_afr = pd.DataFrame(X_train_afr_raw, columns=preprocessor_afr.get_feature_names_out())"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"with pd.option_context('display.max_rows', 5, 'display.max_columns', None):\n",
|
|
" display (X_train_afr)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"pipeline_afr = Pipeline(steps=[('preprocessor', preprocessor_afr), \n",
|
|
" ('model', regressor)])\n",
|
|
"\n",
|
|
"pipeline_afr.fit(X_train, y_train)\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"predictions = pipeline_afr.predict(X_test) \n",
|
|
"\n",
|
|
"metrics = {}\n",
|
|
"metrics[\"mae\"] = mean_absolute_error(y_test, predictions) \n",
|
|
"metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions)\n",
|
|
"metrics[\"mse\"] = mean_squared_error(y_test, predictions)\n",
|
|
"\n",
|
|
"metrics"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"\n",
|
|
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
|
|
"\n",
|
|
"with mlflow.start_run(run_name='autofeat', experiment_id=experiment_id) as run:\n",
|
|
" # получаем уникальный идентификатор запуска эксперимента\n",
|
|
" run_id = run.info.run_id \n",
|
|
" mlflow.sklearn.log_model(pipeline_afr, \n",
|
|
" artifact_path=\"models\",\n",
|
|
" signature=signature,\n",
|
|
" input_example=input_example,\n",
|
|
" pip_requirements=req_file\n",
|
|
" )\n",
|
|
" mlflow.log_metrics(metrics)\n",
|
|
" mlflow.log_artifact(art)\n",
|
|
" mlflow.log_params(pipeline_afr.get_params())\n",
|
|
"\n",
|
|
"run = mlflow.get_run(run_id) \n",
|
|
"assert (run.info.status =='FINISHED')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": ".venv_labs_proj",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.12"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|