Вы не можете выбрать более 25 тем. Темы должны начинаться с буквы или цифры, могут содержать дефисы (-) и должны содержать не более 35 символов.

1016 строки
31 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import mlflow\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"import pandas as pd\n",
"import numpy\n",
"\n",
"from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder\n",
"\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.linear_model import LinearRegression\n",
"from catboost import CatBoostRegressor\n",
"\n",
"from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_pickle('data/clean_data.pkl').sample(frac=0.1, random_state = 2) # Уменьшаем размер, чтобы модель быстрее обучалась на лекции\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df = df.rename(columns={'price': 'target'})\n",
"df = df.drop(columns=['date', 'time'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(df.drop('target', axis=1), df['target'], test_size=0.25, random_state=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cat_features = X_train.select_dtypes(include=['category','object']).columns.to_list()\n",
"cat_features"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"num_features = X_train.select_dtypes(include=['number']).columns.to_list()\n",
"num_features"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"https://scikit-learn.org/stable/api/sklearn.preprocessing.html - разные способы кодирования и скалирования"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"s_scaler = StandardScaler()\n",
"l_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=99999999) # unknown_value нужно выбирать с умом\n",
"regressor = CatBoostRegressor()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Column transformer"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Для удобной работы со столбцами\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" ('num', s_scaler, num_features), # преобразования для числовых признаков\n",
" ('cat', l_encoder, cat_features), # преобразования для категориальных признаков\n",
" ],\n",
" remainder='drop' ) # Удаляем столбцы, которые не затронуты преобразованиями"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"pipeline = Pipeline(steps=[('preprocessor', preprocessor), \n",
" ('model', regressor)])\n",
"\n",
"pipeline.fit(X_train, y_train)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"predictions = pipeline.predict(X_test) \n",
"\n",
"metrics = {}\n",
"metrics[\"mae\"] = mean_absolute_error(y_test, predictions) \n",
"metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions)\n",
"metrics[\"mse\"] = mean_squared_error(y_test, predictions)\n",
"\n",
"metrics"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# Работаем с MLflow локально\n",
"TRACKING_SERVER_HOST = \"127.0.0.1\"\n",
"TRACKING_SERVER_PORT = 5000\n",
"\n",
"registry_uri = f\"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}\"\n",
"tracking_uri = f\"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}\"\n",
"\n",
"mlflow.set_tracking_uri(tracking_uri) \n",
"mlflow.set_registry_uri(registry_uri) \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Название тестового эксперимента, запуска (run) внутри него и имя, под которым модель будет регистрироваться\n",
"EXPERIMENT_NAME = \"estate_project\"\n",
"RUN_NAME = \"baseline model\"\n",
"REGISTRY_MODEL_NAME = \"estate_model_rf\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Логируем вручную"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Обязательно логируем сигнатуру модели и пример входных данных. Подготовим их\n",
"from mlflow.models import infer_signature\n",
"\n",
"signature = infer_signature(model_input = X_train.head(5))\n",
"input_example = X_train.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Будем логировать requirements и артефакт - текстовый файл\n",
"req_file = 'requirements.txt'\n",
"art = 'comment.txt'"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Параметры, которые будут залогированы, можем задавать вручную или полностью взять из модели\n",
"#params_dict = {'n_estimators': 10, 'max_depth': 10}\n",
"params_dict = pipeline.get_params()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Когда создаем новый эксперимент, то: \n",
"experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)\n",
"\n",
"# Впоследствии, чтобы добавлять запуски в этот же эксперимент, мы должны получить его id:\n",
"#experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
"\n",
"with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
" # получаем уникальный идентификатор запуска эксперимента\n",
" run_id = run.info.run_id \n",
" mlflow.sklearn.log_model(pipeline, \n",
" artifact_path=\"models\",\n",
" signature=signature,\n",
" input_example=input_example,\n",
" pip_requirements=req_file\n",
" )\n",
" mlflow.log_metrics(metrics)\n",
" mlflow.log_artifact(art)\n",
" mlflow.log_params(params_dict)\n",
"\n",
"run = mlflow.get_run(run_id) \n",
"assert (run.info.status =='FINISHED')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Удаление runs, experiments\n",
"\n",
"Использовать осторожно"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
"#mlflow.delete_experiment(experiment_id)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mlflow.search_runs(\n",
" #experiment_ids=[experiment_id],\n",
" experiment_names=[EXPERIMENT_NAME],\n",
" # filter_string='status = \"FAILED\"'\n",
" #filter_string='metrics.mae > 1'\n",
" \n",
")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"\n",
"#mlflow.delete_run('74d2a7a40c07413c9cf65df841164356')"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Автологирование\n",
"После включения будет срабатывать на каждом обучении модели (на методе fit()).\n",
"\n",
"Есть плюсы, есть и минусы. Предлагается сделать прогон и сравнить с результатами ручного логирования."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"mlflow.sklearn.autolog()\n",
"\n",
"with mlflow.start_run(run_name='auto', experiment_id=experiment_id) as run:\n",
" pipeline.fit(X_train, y_train)\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Отключаем автологирование\n",
"mlflow.sklearn.autolog(disable=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Model #2\n",
"Обучим вторую \"маленькую\" модель\n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"regressor2 = RandomForestRegressor(n_estimators=10, max_depth=6)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline = Pipeline(steps=[('preprocessor', preprocessor), \n",
" ('model', regressor2)])\n",
"\n",
"pipeline.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"predictions = pipeline.predict(X_test) \n",
"metrics = {}\n",
"metrics[\"mae\"] = mean_absolute_error(y_test, predictions) \n",
"metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions)\n",
"metrics[\"mse\"] = mean_squared_error(y_test, predictions)\n",
"\n",
"metrics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# !!! Проверить название прогона, а также что все логируемые параметры и артефакты соответствуют второй \"маленькой\" модели.\n",
"\n",
"\n",
"RUN_NAME = 'smaller_model'\n",
"\n",
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
"\n",
"with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
" # получаем уникальный идентификатор запуска эксперимента\n",
" run_id = run.info.run_id \n",
" mlflow.sklearn.log_model(pipeline, \n",
" artifact_path=\"models\",\n",
" signature=signature,\n",
" input_example=input_example,\n",
" pip_requirements=req_file\n",
" )\n",
" mlflow.log_metrics(metrics)\n",
" mlflow.log_artifact(art)\n",
" mlflow.log_params(pipeline.get_params())\n",
"\n",
"run = mlflow.get_run(run_id) \n",
"assert (run.info.status =='FINISHED')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# No model\n",
"# Логировать можно только артефакты, без модели. Например, залогировать графики после этапа EDA\n",
"\n",
"RUN_NAME = 'no_model'\n",
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
"\n",
"with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
" run_id = run.info.run_id \n",
" mlflow.log_artifact(art)\n",
"\n",
"\n",
"run = mlflow.get_run(run_id) \n",
"assert (run.info.status =='FINISHED')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"run_id = '06fa7ec1f1b74aedb3509c88dc4ee1c0' # Указываем run id\n",
"mlflow.register_model(f\"runs:/{run_id}/models\", REGISTRY_MODEL_NAME)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Можно регистрировать сразу при создании прогона\n",
"\n",
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
"\n",
"with mlflow.start_run(run_name='register_at_run', experiment_id=experiment_id) as run:\n",
" # получаем уникальный идентификатор запуска эксперимента\n",
" run_id = run.info.run_id \n",
" mlflow.sklearn.log_model(pipeline, \n",
" artifact_path=\"models\",\n",
" signature=signature,\n",
" input_example=input_example,\n",
" pip_requirements=req_file,\n",
" registered_model_name = REGISTRY_MODEL_NAME # Указываем для какой модели регистрируем\n",
" )\n",
" mlflow.log_metrics(metrics)\n",
" mlflow.log_artifact(art)\n",
" mlflow.log_params(pipeline.get_params())\n",
"\n",
"run = mlflow.get_run(run_id) \n",
"assert (run.info.status =='FINISHED')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Можно найти зарегистрированные модели\n",
"model_reg = mlflow.search_registered_models()\n",
"model_reg[0]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"\n",
"model_name = REGISTRY_MODEL_NAME\n",
"model_version = 1\n",
"\n",
"model_loaded = mlflow.sklearn.load_model(model_uri=f\"models:/{model_name}/{model_version}\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_loaded.predict(X_test.iloc[0:1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_test.iloc[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Feature engineering"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Sklearn"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.preprocessing import QuantileTransformer, SplineTransformer, PolynomialFeatures, MinMaxScaler"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"X_train_sklearn = X_train.copy()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### PolynomialFeatures\n",
"Создает полиномиальные признаки из указанных признаков: все их произведения и степени суммарной степени не выше `degree`\n"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"pf = PolynomialFeatures(degree=2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_train_sklearn"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pf.fit_transform(X_train_sklearn[['area','kitchen_area']])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### SplineTransformer\n",
"Создаёт новую матрицу признаков, состоящую из сплайнов порядка `degree`. Количество сгенерированных сплайнов равно `n_splines = n_knots + degree - 1` для каждого признака, где\n",
"\n",
"`n_knots` определяет количество узлов (точек, в которых сопрягаются сплайны) для каждого признака. \n",
"\n",
"`degree` определяет порядок полинома, используемого для построения сплайнов. "
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"sp = SplineTransformer(n_knots=3, degree=3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sp.fit_transform(X_train_sklearn[['area']])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### QuantileTransformer\n",
"Этот метод преобразует признаки, чтобы они распределялись равномерно или нормально — так данные меньше подвергаются влиянию выбросов. Преобразование применяется к каждому признаку независимо. Идея метода такова: оценить функцию распределения признака, чтобы преобразовать исходные значения в равномерное или нормальное распределение. \n",
"\n",
"`output_distribution='uniform'` или\n",
"`output_distribution='normal'` соответственно\n",
"\n",
"\n",
"Пример использования: если у вас есть данные о доходах с широким диапазоном значений, квантильное преобразование сделает их более сопоставимыми и устойчивыми к выбросам."
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"qt = QuantileTransformer()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"qt.fit_transform(X_train_sklearn[['area']])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Объединяем в ColumnTransformer и создаем Pipeline "
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"pf = PolynomialFeatures(degree=2)\n",
"qt = QuantileTransformer()\n",
"sp = SplineTransformer(n_knots=3, degree=3)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"# Значения преобразованных признаков нужно отскейлить, поэтому создаем pipeline из двух шагов - преобразование и скейлинг\n",
"pf_pipeline = Pipeline(steps=[\n",
" ('poly', pf),\n",
" ('scale', StandardScaler())\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"preprocessor_sklearn = ColumnTransformer(\n",
" transformers=[\n",
" ('num', s_scaler, num_features), # преобразования для числовых признаков\n",
" ('cat', l_encoder, cat_features), # преобразования для категориальных признаков\n",
" ('quantile', qt,num_features),\n",
" ('poly', pf_pipeline, ['area', 'kitchen_area']), # В преобразования добавляем созданный ранее pipeline\n",
" ('spline', sp, ['area'])\n",
" ],\n",
" remainder='drop',\n",
" ) # Удаляем столбцы, которые не затронуты преобразованиями"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Посмотрим что из себя теперь представляет датафрейм"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"## не влезаем в float64 в полиномиальном преобразовании. Использовать его нужно с умом!\n",
"X_train_sklearn[['area', 'kitchen_area']] = X_train_sklearn[['area', 'kitchen_area']].astype('float128')\n",
"X_train_sklearn[['area', 'kitchen_area']] = X_train_sklearn[['area', 'kitchen_area']].astype('float128')"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"X_train_sklearn_raw = preprocessor_sklearn.fit_transform(X_train_sklearn)\n",
"X_train_sklearn = pd.DataFrame(X_train_sklearn_raw, columns=preprocessor_sklearn.get_feature_names_out())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Удобно использовать для настройки числа отображаемых строк и столбцов DataFrame (здесь: не более 5 строк, все столбцы)\n",
"with pd.option_context('display.max_rows', 5, 'display.max_columns', None):\n",
" display (X_train_sklearn)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Создаем пайплайн с препроцессингом и моделью"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline_sklearn = Pipeline(steps=[\n",
" ('transform', preprocessor_sklearn),\n",
" ('model', regressor)\n",
"])\n",
"\n",
"model_sklearn = pipeline_sklearn.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_sklearn"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"predictions = model_sklearn.predict(X_test) \n",
"metrics = {}\n",
"metrics[\"mae\"] = mean_absolute_error(y_test, predictions) \n",
"metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions)\n",
"metrics[\"mse\"] = mean_squared_error(y_test, predictions)\n",
"\n",
"metrics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
"RUN_NAME = 'fe_sklearn'\n",
"\n",
"with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
" # получаем уникальный идентификатор запуска эксперимента\n",
" run_id = run.info.run_id \n",
" mlflow.sklearn.log_model(model_sklearn, \n",
" artifact_path=\"models\",\n",
" signature=signature,\n",
" input_example=input_example,\n",
" pip_requirements=req_file\n",
" )\n",
" mlflow.log_metrics(metrics)\n",
" mlflow.log_artifact(art)\n",
" mlflow.log_params(model_sklearn.get_params())\n",
"\n",
"run = mlflow.get_run(run_id) \n",
"assert (run.info.status =='FINISHED')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Autofeat"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"from autofeat import AutoFeatRegressor\n",
"transformations = [\"1/\", \"exp\", \"log\", \"abs\", \"sqrt\", \"^2\", \"^3\", \"1+\", \"1-\", \"sin\", \"cos\", \"exp-\", \"2^\"] "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"afreg = AutoFeatRegressor(verbose=1, feateng_steps=2, max_gb=8, transformations=[\"log\", \"sqrt\"],feateng_cols=num_features)\n",
"X_train_arf = afreg.fit_transform(X_train,y_train)\n",
"X_train_arf"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"# Создаем обертку, в которой добавляем метод get_feature_names_out() для получения названий признаков\n",
"import numpy as np\n",
"\n",
"class AutoFeatWrapper():\n",
" def __init__(self, feateng_cols, feateng_steps=1, max_gb=16, transformations=[\"1/\", \"exp\", \"log\"], n_jobs=-1, verbose=1):\n",
" self.feateng_cols = feateng_cols\n",
" self.feateng_steps = feateng_steps\n",
" self.max_gb = max_gb\n",
" self.transformations = transformations\n",
" self.n_jobs = n_jobs\n",
" self.afreg = AutoFeatRegressor(feateng_cols=self.feateng_cols,\n",
" feateng_steps=self.feateng_steps,\n",
" max_gb=self.max_gb,\n",
" transformations=self.transformations,\n",
" n_jobs=self.n_jobs)\n",
" \n",
" def fit(self, X, y=None):\n",
" self.afreg.fit(X, y)\n",
" return self\n",
" \n",
" def transform(self, X):\n",
" return self.afreg.transform(X)\n",
" \n",
" def get_feature_names_out(self, input_features=None):\n",
" # Преобразуем данные и возвращаем имена фичей из DataFrame\n",
" transformed_X = self.afreg.transform(pd.DataFrame(np.zeros((1, len(self.feateng_cols))), columns=self.feateng_cols))\n",
" return transformed_X.columns.tolist()"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"afreg_pipeline = Pipeline(steps=[\n",
" ('autofeat', AutoFeatWrapper( feateng_steps=2, max_gb=16, transformations=[\"log\", \"sqrt\"],feateng_cols=num_features)),\n",
" ('scaler', StandardScaler()),\n",
"])"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"preprocessor_afr = ColumnTransformer(\n",
" transformers=[\n",
" ('num', s_scaler, num_features), # преобразования для числовых признаков\n",
" ('cat', l_encoder, cat_features), # преобразования для категориальных признаков\n",
" ('afr', afreg_pipeline, num_features), # преобразования autofeat\n",
" ],\n",
" remainder='drop', # Удаляем столбцы, которые не затронуты преобразованиями\n",
" ) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X_train_afr_raw = preprocessor_afr.fit_transform(X_train,y_train)\n",
"X_train_afr = pd.DataFrame(X_train_afr_raw, columns=preprocessor_afr.get_feature_names_out())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with pd.option_context('display.max_rows', 5, 'display.max_columns', None):\n",
" display (X_train_afr)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pipeline_afr = Pipeline(steps=[('preprocessor', preprocessor_afr), \n",
" ('model', regressor)])\n",
"\n",
"pipeline_afr.fit(X_train, y_train)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"predictions = pipeline_afr.predict(X_test) \n",
"\n",
"metrics = {}\n",
"metrics[\"mae\"] = mean_absolute_error(y_test, predictions) \n",
"metrics[\"mape\"] = mean_absolute_percentage_error(y_test, predictions)\n",
"metrics[\"mse\"] = mean_squared_error(y_test, predictions)\n",
"\n",
"metrics"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
"\n",
"with mlflow.start_run(run_name='autofeat', experiment_id=experiment_id) as run:\n",
" # получаем уникальный идентификатор запуска эксперимента\n",
" run_id = run.info.run_id \n",
" mlflow.sklearn.log_model(pipeline_afr, \n",
" artifact_path=\"models\",\n",
" signature=signature,\n",
" input_example=input_example,\n",
" pip_requirements=req_file\n",
" )\n",
" mlflow.log_metrics(metrics)\n",
" mlflow.log_artifact(art)\n",
" mlflow.log_params(pipeline_afr.get_params())\n",
"\n",
"run = mlflow.get_run(run_id) \n",
"assert (run.info.status =='FINISHED')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv_labs_proj",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}