{ "cells": [
 { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [
  "import os\n",
  "import mlflow\n",
  "\n",
  "from sklearn.model_selection import train_test_split\n",
  "import pandas as pd\n",
  "import numpy\n",
  "\n",
  "from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder\n",
  "\n",
  "from sklearn.pipeline import Pipeline\n",
  "from sklearn.compose import ColumnTransformer\n",
  "from sklearn.ensemble import RandomForestRegressor\n",
  "from sklearn.linear_model import LinearRegression\n",
  "from catboost import CatBoostRegressor\n",
  "\n",
  "from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, mean_squared_error"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Work on a 10% sample so that the models train faster during the lecture\n",
  "df = pd.read_pickle('data/clean_data.pkl')\n",
  "df = df.sample(frac=0.1, random_state=2)\n",
  "df.info()"
 ] },
 { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
  "# 'price' becomes the prediction target; 'date' and 'time' are not used as features\n",
  "df = df.rename(columns={'price': 'target'}).drop(columns=['date', 'time'])"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df" ] },
 { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
  "# Hold out 25% of the rows for evaluation\n",
  "X_train, X_test, y_train, y_test = train_test_split(\n",
  "    df.drop(columns='target'),\n",
  "    df['target'],\n",
  "    test_size=0.25,\n",
  "    random_state=2,\n",
  ")"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Columns with categorical / object dtype\n",
  "cat_features = X_train.select_dtypes(include=['category', 'object']).columns.tolist()\n",
  "cat_features"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Columns with numeric dtype\n",
  "num_features = X_train.select_dtypes(include=['number']).columns.tolist()\n",
  "num_features"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "https://scikit-learn.org/stable/api/sklearn.preprocessing.html - разные способы кодирования и
скалирования" ] },
 { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [
  "s_scaler = StandardScaler()\n",
  "# unknown_value has to be chosen carefully: it is the code emitted for categories not seen during fit\n",
  "l_encoder = OrdinalEncoder(\n",
  "    handle_unknown='use_encoded_value',\n",
  "    unknown_value=99999999,\n",
  ")\n",
  "regressor = CatBoostRegressor()"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Column transformer" ] },
 { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [
  "# ColumnTransformer routes each group of columns to its own transformer\n",
  "preprocessor = ColumnTransformer(\n",
  "    transformers=[\n",
  "        ('num', s_scaler, num_features),  # numeric features\n",
  "        ('cat', l_encoder, cat_features),  # categorical features\n",
  "    ],\n",
  "    # columns not listed in any transformer are dropped\n",
  "    remainder='drop',\n",
  ")"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "pipeline = Pipeline(\n",
  "    steps=[\n",
  "        ('preprocessor', preprocessor),\n",
  "        ('model', regressor),\n",
  "    ]\n",
  ")\n",
  "\n",
  "pipeline.fit(X_train, y_train)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "predictions = pipeline.predict(X_test)\n",
  "\n",
  "metrics = {\n",
  "    \"mae\": mean_absolute_error(y_test, predictions),\n",
  "    \"mape\": mean_absolute_percentage_error(y_test, predictions),\n",
  "    \"mse\": mean_squared_error(y_test, predictions),\n",
  "}\n",
  "\n",
  "metrics"
 ] },
 { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [
  "# MLflow is used locally; tracking and the model registry share the same address\n",
  "TRACKING_SERVER_HOST = \"127.0.0.1\"\n",
  "TRACKING_SERVER_PORT = 5000\n",
  "\n",
  "tracking_uri = f\"http://{TRACKING_SERVER_HOST}:{TRACKING_SERVER_PORT}\"\n",
  "registry_uri = tracking_uri\n",
  "\n",
  "mlflow.set_tracking_uri(tracking_uri)\n",
  "mlflow.set_registry_uri(registry_uri)"
 ] },
 { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [
  "# Names of the experiment, of the run inside it, and of the model in the registry\n",
  "EXPERIMENT_NAME = \"estate_project\"\n",
  "RUN_NAME = \"baseline model\"\n",
  "REGISTRY_MODEL_NAME = \"estate_model_rf\""
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "# Логируем вручную" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Always log the model signature and an input example; both are prepared here\n",
  "from mlflow.models import infer_signature\n",
  "\n",
  "input_example = X_train.head(5)\n",
  "# Predictions are passed too, so the signature describes the output schema and not only the inputs\n",
  "signature = infer_signature(\n",
  "    model_input=input_example,\n",
  "    model_output=pipeline.predict(input_example),\n",
  ")"
 ] },
 { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [
  "# A requirements file and a text-file artifact will be logged with the run\n",
  "req_file = 'requirements.txt'\n",
  "art = 'comment.txt'"
 ] },
 { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [
  "# Parameters to log can be listed by hand or taken in full from the model\n",
  "#params_dict = {'n_estimators': 10, 'max_depth': 10}\n",
  "params_dict = pipeline.get_params()"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# create_experiment raises if the name is already taken, so on a re-run the existing experiment is reused\n",
  "existing_experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)\n",
  "if existing_experiment is None:\n",
  "    experiment_id = mlflow.create_experiment(EXPERIMENT_NAME)\n",
  "else:\n",
  "    experiment_id = existing_experiment.experiment_id\n",
  "\n",
  "# Впоследствии. 
чтобы добавлять запуски в этот же эксперимент мы должны получить его id:\n",
  "#experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
  "\n",
  "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
  "    # keep the unique id of this run for the status check after the block\n",
  "    run_id = run.info.run_id\n",
  "    mlflow.sklearn.log_model(\n",
  "        pipeline,\n",
  "        artifact_path=\"models\",\n",
  "        signature=signature,\n",
  "        input_example=input_example,\n",
  "        pip_requirements=req_file,\n",
  "    )\n",
  "    mlflow.log_metrics(metrics)\n",
  "    mlflow.log_artifact(art)\n",
  "    mlflow.log_params(params_dict)\n",
  "\n",
  "run = mlflow.get_run(run_id)\n",
  "assert run.info.status == 'FINISHED'"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "# Удаление runs, experiments\n",
  "\n",
  "Использовать осторожно"
 ] },
 { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [
  "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
  "#mlflow.delete_experiment(experiment_id)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "mlflow.search_runs(\n",
  "    #experiment_ids=[experiment_id],\n",
  "    experiment_names=[EXPERIMENT_NAME],\n",
  "    #filter_string='status = \"FAILED\"'\n",
  "    #filter_string='metrics.mae > 1'\n",
  ")"
 ] },
 { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [
  "#mlflow.delete_run('74d2a7a40c07413c9cf65df841164356')"
 ] },
 { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "# Автологирование\n",
  "После включения будет срабатывать на каждом обучении модели (на методе fit()).\n",
  "\n",
  "Есть плюсы, есть и минусы. 
Предлагается сделать прогон и сравнить с результатами вручную " ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "mlflow.sklearn.autolog()\n",
  "\n",
  "with mlflow.start_run(run_name='auto', experiment_id=experiment_id) as run:\n",
  "    pipeline.fit(X_train, y_train)"
 ] },
 { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [
  "# Disable autologging\n",
  "mlflow.sklearn.autolog(disable=True)"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "# Model #2\n",
  "Обучим вторую \"маленькую\" модель\n"
 ] },
 { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [
  "# Fixed seed: a random forest is stochastic, without it the logged metrics differ on every run\n",
  "regressor2 = RandomForestRegressor(n_estimators=10, max_depth=6, random_state=2)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "pipeline = Pipeline(\n",
  "    steps=[\n",
  "        ('preprocessor', preprocessor),\n",
  "        ('model', regressor2),\n",
  "    ]\n",
  ")\n",
  "\n",
  "pipeline.fit(X_train, y_train)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "predictions = pipeline.predict(X_test)\n",
  "\n",
  "metrics = {\n",
  "    \"mae\": mean_absolute_error(y_test, predictions),\n",
  "    \"mape\": mean_absolute_percentage_error(y_test, predictions),\n",
  "    \"mse\": mean_squared_error(y_test, predictions),\n",
  "}\n",
  "\n",
  "metrics"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# !!! Проверить название прогона а также все логируемые параметры и артефакты, что они соответствуют второй \"маленькой\" модели. 
\n",
  "\n",
  "RUN_NAME = 'smaller_model'\n",
  "\n",
  "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
  "\n",
  "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
  "    # keep the unique id of this run\n",
  "    run_id = run.info.run_id\n",
  "    mlflow.sklearn.log_model(\n",
  "        pipeline,\n",
  "        artifact_path=\"models\",\n",
  "        signature=signature,\n",
  "        input_example=input_example,\n",
  "        pip_requirements=req_file,\n",
  "    )\n",
  "    mlflow.log_metrics(metrics)\n",
  "    mlflow.log_artifact(art)\n",
  "    mlflow.log_params(pipeline.get_params())\n",
  "\n",
  "run = mlflow.get_run(run_id)\n",
  "assert run.info.status == 'FINISHED'\n",
  "\n",
  "# The next cell overwrites run_id with a run that has no model,\n",
  "# so the id of this run is saved separately for the registration step\n",
  "model_run_id = run_id"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# No model\n",
  "# A run may contain artifacts only, without a model - e.g. plots logged after the EDA stage\n",
  "\n",
  "RUN_NAME = 'no_model'\n",
  "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
  "\n",
  "with mlflow.start_run(run_name=RUN_NAME, experiment_id=experiment_id) as run:\n",
  "    run_id = run.info.run_id\n",
  "    mlflow.log_artifact(art)\n",
  "\n",
  "run = mlflow.get_run(run_id)\n",
  "assert run.info.status == 'FINISHED'"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Register the model logged by the 'smaller_model' run.\n",
  "# Any other run id can be put here, as long as that run logged a model under the \"models\" artifact path;\n",
  "# a literal id copied from another tracking server would not exist on a fresh one.\n",
  "run_id = model_run_id\n",
  "mlflow.register_model(f\"runs:/{run_id}/models\", REGISTRY_MODEL_NAME)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# A model can also be registered right when the run is created\n",
  "\n",
  "experiment_id = mlflow.get_experiment_by_name(EXPERIMENT_NAME).experiment_id\n",
  "\n",
  "with mlflow.start_run(run_name='register_at_run', experiment_id=experiment_id) as run:\n",
  "    # keep the unique id of this run\n",
  "    run_id = run.info.run_id\n",
  "    mlflow.sklearn.log_model(\n",
  "        pipeline,\n",
  "        artifact_path=\"models\",\n",
  "        signature=signature,\n",
  "        input_example=input_example,\n",
  "        pip_requirements=req_file,\n",
  "        # name of the registered model this version is added to\n",
  "        registered_model_name=REGISTRY_MODEL_NAME,\n",
  "    )\n",
  "    mlflow.log_metrics(metrics)\n",
  "    mlflow.log_artifact(art)\n",
  "    mlflow.log_params(pipeline.get_params())\n",
  "\n",
  "run = mlflow.get_run(run_id)\n",
  "assert run.info.status == 'FINISHED'"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "# Registered models can be looked up\n",
  "model_reg = mlflow.search_registered_models()\n",
  "model_reg[0]"
 ] },
 { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [
  "model_name = REGISTRY_MODEL_NAME\n",
  "model_version = 1\n",
  "\n",
  "model_loaded = mlflow.sklearn.load_model(model_uri=f\"models:/{model_name}/{model_version}\")"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model_loaded.predict(X_test.iloc[0:1])" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "y_test.iloc[0]" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "# Feature engineering" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "## Sklearn" ] },
 { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [
  "from sklearn.preprocessing import QuantileTransformer, SplineTransformer, PolynomialFeatures, MinMaxScaler"
 ] },
 { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "X_train_sklearn = X_train.copy()" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "#### PolynomialFeatures\n",
  "Создает полином степени `degree` из указанных признаков\n"
 ] },
 { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "pf = PolynomialFeatures(degree=2)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "X_train_sklearn" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "pf.fit_transform(X_train_sklearn[['area','kitchen_area']])" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "#### SplineTransformer\n",
  "Создаёт новую матрицу признаков, состоящую из сплайнов порядка `degree`. Количество сгенерированных сплайнов равно `n_splines=n_knots + degree - 1` для каждого признака, где\n",
  "\n",
  "`n_knots` определяет количество узлов (точек, в которых сопрягаются сплайны) для каждого признака. \n",
  "\n",
  "`degree` определяет порядок полинома, используемого для построения сплайнов. "
 ] },
 { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "sp = SplineTransformer(n_knots=3, degree=3)" ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sp.fit_transform(X_train_sklearn[['area']])" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [
  "#### QuantileTransformer\n",
  "Этот метод преобразует признаки, чтобы они распределялись равномерно или нормально — так данные меньше подвергаются влиянию выбросов. Преобразование применяется к каждому признаку независимо. Идея метода такова: оценить функцию распределения признака, чтобы преобразовать исходные значения в равномерное или нормальное распределение. \n",
  "\n",
  "`output_distribution='uniform'` или\n",
  "`output_distribution='normal'` соответственно\n",
  "\n",
  "\n",
  "Пример использования: если у вас есть данные о доходах с широким диапазоном значений, квантильное преобразование сделает их более сопоставимыми и устойчивыми к выбросам."
] },
 { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [
  "# Fixed seed: on data larger than `subsample` the quantiles are estimated on a random subsample,\n",
  "# so without random_state the transformed values change from run to run\n",
  "qt = QuantileTransformer(random_state=2)"
 ] },
 { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "qt.fit_transform(X_train_sklearn[['area']])" ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### Объединяем в ColumnTransformer и создаем Pipeline " ] },
 { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [
  "pf = PolynomialFeatures(degree=2)\n",
  "# seeded for the same reason as above: reproducible quantile estimation\n",
  "qt = QuantileTransformer(random_state=2)\n",
  "sp = SplineTransformer(n_knots=3, degree=3)"
 ] },
 { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [
  "# The generated polynomial features have to be scaled, hence a two-step pipeline: transform, then scale\n",
  "pf_pipeline = Pipeline(\n",
  "    steps=[\n",
  "        ('poly', pf),\n",
  "        ('scale', StandardScaler()),\n",
  "    ]\n",
  ")"
 ] },
 { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [
  "preprocessor_sklearn = ColumnTransformer(\n",
  "    transformers=[\n",
  "        ('num', s_scaler, num_features),  # numeric features\n",
  "        ('cat', l_encoder, cat_features),  # categorical features\n",
  "        ('quantile', qt, num_features),\n",
  "        ('poly', pf_pipeline, ['area', 'kitchen_area']),  # the pipeline built in the previous cell\n",
  "        ('spline', sp, ['area']),\n",
  "    ],\n",
  "    # columns not listed in any transformer are dropped\n",
  "    remainder='drop',\n",
  ")"
 ] },
 { "cell_type": "markdown", "metadata": {}, "source": [ "### Посмотрим что из себя теперь представляет датафрейм" ] },
 { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [], "source": [
  "## не влезаем в float64 в полиномальном преобразовании. 
Использовать его нужно с умом!\n",
  "# numpy.longdouble is the portable spelling of the extended-precision float:\n",
  "# the 'float128' alias does not exist on every platform (e.g. Windows builds of NumPy).\n",
  "# The cast is done once - repeating the same statement changes nothing.\n",
  "X_train_sklearn[['area', 'kitchen_area']] = X_train_sklearn[['area', 'kitchen_area']].astype(numpy.longdouble)"
 ] },
 { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [
  "X_train_sklearn_raw = preprocessor_sklearn.fit_transform(X_train_sklearn)\n",
  "X_train_sklearn = pd.DataFrame(X_train_sklearn_raw, columns=preprocessor_sklearn.get_feature_names_out())"
 ] },
 { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | num__geo_lat | \n", "num__geo_lon | \n", "num__level | \n", "num__levels | \n", "num__rooms | \n", "num__area | \n", "num__kitchen_area | \n", "cat__region | \n", "cat__building_type | \n", "cat__object_type | \n", "quantile__geo_lat | \n", "quantile__geo_lon | \n", "quantile__level | \n", "quantile__levels | \n", "quantile__rooms | \n", "quantile__area | \n", "quantile__kitchen_area | \n", "poly__1 | \n", "poly__area | \n", "poly__kitchen_area | \n", "poly__area^2 | \n", "poly__area kitchen_area | \n", "poly__kitchen_area^2 | \n", "spline__area_sp_0 | \n", "spline__area_sp_1 | \n", "spline__area_sp_2 | \n", "spline__area_sp_3 | \n", "spline__area_sp_4 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0.495902 | \n", "-0.449742 | \n", "0.359235 | \n", "-0.214789 | \n", "0.253413 | \n", "0.063735 | \n", "-0.186285 | \n", "20.0 | \n", "1.0 | \n", "0.0 | \n", "0.766257 | \n", "0.511028 | \n", "0.717217 | \n", "0.536537 | \n", "0.600601 | \n", "0.623624 | \n", "0.374875 | \n", "0.0 | \n", "0.063735 | \n", "-0.186285 | \n", "-0.010002 | \n", "-0.132188 | \n", "-0.002792 | \n", "0.155806 | \n", "0.666179 | \n", "0.178013 | \n", "0.000002 | \n", "0.0 | \n", "
1 | \n", "0.177806 | \n", "1.433673 | \n", "-0.246529 | \n", "-0.367718 | \n", "0.253413 | \n", "-0.114293 | \n", "-0.186285 | \n", "70.0 | \n", "1.0 | \n", "0.0 | \n", "0.297142 | \n", "0.867999 | \n", "0.522022 | \n", "0.386887 | \n", "0.600601 | \n", "0.541542 | \n", "0.374875 | \n", "0.0 | \n", "-0.114293 | \n", "-0.186285 | \n", "-0.017375 | \n", "-0.169370 | \n", "-0.002792 | \n", "0.156921 | \n", "0.666275 | \n", "0.176803 | \n", "0.000001 | \n", "0.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
410773 | \n", "-0.748366 | \n", "-0.804077 | \n", "-0.650371 | \n", "0.702788 | \n", "0.253413 | \n", "1.365441 | \n", "1.501833 | \n", "52.0 | \n", "3.0 | \n", "0.0 | \n", "0.193143 | \n", "0.114753 | \n", "0.309810 | \n", "0.741742 | \n", "0.600601 | \n", "0.961367 | \n", "0.984535 | \n", "0.0 | \n", "1.365441 | \n", "1.501833 | \n", "0.068438 | \n", "1.570163 | \n", "0.008616 | \n", "0.147820 | \n", "0.665159 | \n", "0.187011 | \n", "0.000010 | \n", "0.0 | \n", "
410774 | \n", "1.257769 | \n", "-1.101815 | \n", "-0.044608 | \n", "0.091070 | \n", "1.175911 | \n", "0.553789 | \n", "-0.142544 | \n", "14.0 | \n", "1.0 | \n", "0.0 | \n", "0.908036 | \n", "0.075725 | \n", "0.604605 | \n", "0.645646 | \n", "0.867367 | \n", "0.841842 | \n", "0.436436 | \n", "0.0 | \n", "0.553789 | \n", "-0.142544 | \n", "0.014463 | \n", "-0.002742 | \n", "-0.002649 | \n", "0.152767 | \n", "0.665860 | \n", "0.181370 | \n", "0.000004 | \n", "0.0 | \n", "
410775 rows × 28 columns
\n", "\n", " | num__geo_lat | \n", "num__geo_lon | \n", "num__level | \n", "num__levels | \n", "num__rooms | \n", "num__area | \n", "num__kitchen_area | \n", "cat__region | \n", "cat__building_type | \n", "cat__object_type | \n", "afr__geo_lat | \n", "afr__geo_lon | \n", "afr__level | \n", "afr__levels | \n", "afr__rooms | \n", "afr__area | \n", "afr__kitchen_area | \n", "afr__area*rooms | \n", "afr__area*geo_lon | \n", "afr__levels*rooms | \n", "afr__area*kitchen_area | \n", "afr__sqrt(area)*geo_lat | \n", "afr__sqrt(area)*log(level) | \n", "afr__kitchen_area*log(level) | \n", "afr__sqrt(area)*kitchen_area | \n", "afr__geo_lon*log(kitchen_area) | \n", "afr__sqrt(area)*sqrt(kitchen_area) | \n", "afr__sqrt(geo_lon)*sqrt(kitchen_area) | \n", "afr__log(area) | \n", "afr__rooms*log(level) | \n", "afr__kitchen_area*rooms | \n", "afr__kitchen_area*levels | \n", "afr__sqrt(geo_lon)*sqrt(level) | \n", "afr__area**(3/2) | \n", "afr__geo_lat*log(kitchen_area) | \n", "afr__geo_lat*log(geo_lon) | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0.495902 | \n", "-0.449742 | \n", "0.359235 | \n", "-0.214789 | \n", "0.253413 | \n", "0.063735 | \n", "-0.186285 | \n", "20.0 | \n", "1.0 | \n", "0.0 | \n", "0.495902 | \n", "-0.449742 | \n", "0.359235 | \n", "-0.214789 | \n", "0.253413 | \n", "0.063735 | \n", "-0.186285 | \n", "0.006208 | \n", "-0.195129 | \n", "0.060916 | \n", "-0.132188 | \n", "0.373151 | \n", "0.688076 | \n", "0.044178 | \n", "-0.211335 | \n", "-0.481294 | \n", "-0.153548 | \n", "-0.490805 | \n", "0.307835 | \n", "0.690329 | \n", "-0.132529 | \n", "-0.352834 | \n", "0.323880 | \n", "-0.008748 | \n", "-0.031529 | \n", "0.068167 | \n", "
1 | \n", "0.177806 | \n", "1.433673 | \n", "-0.246529 | \n", "-0.367718 | \n", "0.253413 | \n", "-0.114293 | \n", "-0.186285 | \n", "70.0 | \n", "1.0 | \n", "0.0 | \n", "0.177806 | \n", "1.433673 | \n", "-0.246529 | \n", "-0.367718 | \n", "0.253413 | \n", "-0.114293 | \n", "-0.186285 | \n", "-0.083402 | \n", "0.655053 | \n", "-0.054279 | \n", "-0.169370 | \n", "0.005114 | \n", "0.071369 | \n", "-0.173647 | \n", "-0.252775 | \n", "1.191304 | \n", "-0.267268 | \n", "0.615798 | \n", "0.031907 | \n", "0.282625 | \n", "-0.132529 | \n", "-0.418643 | \n", "0.552794 | \n", "-0.056540 | \n", "-0.143829 | \n", "1.129118 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
410773 | \n", "-0.748366 | \n", "-0.804077 | \n", "-0.650371 | \n", "0.702788 | \n", "0.253413 | \n", "1.365441 | \n", "1.501833 | \n", "52.0 | \n", "3.0 | \n", "0.0 | \n", "-0.748366 | \n", "-0.804077 | \n", "-0.650371 | \n", "0.702788 | \n", "0.253413 | \n", "1.365441 | \n", "1.501833 | \n", "0.661427 | \n", "0.375199 | \n", "0.752088 | \n", "1.570163 | \n", "1.274445 | \n", "-0.002521 | \n", "0.745507 | \n", "2.382258 | \n", "0.071599 | \n", "2.828890 | \n", "1.431272 | \n", "1.729715 | \n", "-0.160491 | \n", "1.581436 | \n", "2.432437 | \n", "-0.843150 | \n", "0.411475 | \n", "1.671069 | \n", "-1.052343 | \n", "
410774 | \n", "1.257769 | \n", "-1.101815 | \n", "-0.044608 | \n", "0.091070 | \n", "1.175911 | \n", "0.553789 | \n", "-0.142544 | \n", "14.0 | \n", "1.0 | \n", "0.0 | \n", "1.257769 | \n", "-1.101815 | \n", "-0.044608 | \n", "0.091070 | \n", "1.175911 | \n", "0.553789 | \n", "-0.142544 | \n", "0.807887 | \n", "-0.330070 | \n", "0.982478 | \n", "-0.002742 | \n", "1.338996 | \n", "0.635065 | \n", "-0.040302 | \n", "-0.055435 | \n", "-1.025588 | \n", "0.202136 | \n", "-0.916054 | \n", "0.940624 | \n", "1.217910 | \n", "0.311575 | \n", "-0.174762 | \n", "-0.415359 | \n", "0.135617 | \n", "0.359680 | \n", "-0.246790 | \n", "
410775 rows × 36 columns
\n", "\n", " | num__geo_lat | \n", "num__geo_lon | \n", "num__level | \n", "num__levels | \n", "num__rooms | \n", "num__area | \n", "num__kitchen_area | \n", "cat__region | \n", "cat__building_type | \n", "cat__object_type | \n", "... | \n", "afr__sqrt(area)*sqrt(kitchen_area) | \n", "afr__sqrt(geo_lon)*sqrt(kitchen_area) | \n", "afr__log(area) | \n", "afr__rooms*log(level) | \n", "afr__kitchen_area*rooms | \n", "afr__kitchen_area*levels | \n", "afr__sqrt(geo_lon)*sqrt(level) | \n", "afr__area**(3/2) | \n", "afr__geo_lat*log(kitchen_area) | \n", "afr__geo_lat*log(geo_lon) | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0.495902 | \n", "-0.449742 | \n", "0.359235 | \n", "-0.214789 | \n", "0.253413 | \n", "0.063735 | \n", "-0.186285 | \n", "20.0 | \n", "1.0 | \n", "0.0 | \n", "... | \n", "-0.153548 | \n", "-0.490805 | \n", "0.307835 | \n", "0.690329 | \n", "-0.132529 | \n", "-0.352834 | \n", "0.323880 | \n", "-0.008748 | \n", "-0.031529 | \n", "0.068167 | \n", "
1 | \n", "0.177806 | \n", "1.433673 | \n", "-0.246529 | \n", "-0.367718 | \n", "0.253413 | \n", "-0.114293 | \n", "-0.186285 | \n", "70.0 | \n", "1.0 | \n", "0.0 | \n", "... | \n", "-0.267268 | \n", "0.615798 | \n", "0.031907 | \n", "0.282625 | \n", "-0.132529 | \n", "-0.418643 | \n", "0.552794 | \n", "-0.056540 | \n", "-0.143829 | \n", "1.129118 | \n", "
2 | \n", "0.440548 | \n", "0.047222 | \n", "-0.448450 | \n", "-0.367718 | \n", "-0.669085 | \n", "-0.456947 | \n", "-0.142544 | \n", "15.0 | \n", "3.0 | \n", "1.0 | \n", "... | \n", "-0.454880 | \n", "-0.067183 | \n", "-0.603122 | \n", "-0.512211 | \n", "-0.487813 | \n", "-0.383803 | \n", "-0.243092 | \n", "-0.140800 | \n", "0.063464 | \n", "0.460495 | \n", "
3 | \n", "-1.588818 | \n", "-0.722477 | \n", "-0.246529 | \n", "-0.979436 | \n", "0.253413 | \n", "-0.181292 | \n", "-0.142544 | \n", "18.0 | \n", "1.0 | \n", "0.0 | \n", "... | \n", "-0.254514 | \n", "-0.607607 | \n", "-0.080304 | \n", "0.282625 | \n", "-0.088119 | \n", "-0.662523 | \n", "-0.369355 | \n", "-0.073838 | \n", "-0.672113 | \n", "-1.481033 | \n", "
4 | \n", "1.493662 | \n", "1.125819 | \n", "0.157313 | \n", "0.549858 | \n", "0.253413 | \n", "0.615045 | \n", "-0.011322 | \n", "10.0 | \n", "2.0 | \n", "0.0 | \n", "... | \n", "0.438600 | \n", "0.891383 | \n", "1.009612 | \n", "0.574497 | \n", "0.045112 | \n", "0.208478 | \n", "0.945981 | \n", "0.154902 | \n", "0.780855 | \n", "1.923382 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
410770 | \n", "0.592011 | \n", "0.355014 | \n", "0.561156 | \n", "1.008646 | \n", "0.253413 | \n", "-0.079836 | \n", "-0.092653 | \n", "54.0 | \n", "2.0 | \n", "0.0 | \n", "... | \n", "-0.120035 | \n", "0.237580 | \n", "0.087725 | \n", "0.792500 | \n", "-0.037463 | \n", "0.322797 | \n", "0.974381 | \n", "-0.047496 | \n", "0.243018 | \n", "0.789871 | \n", "
410771 | \n", "0.240478 | \n", "0.392697 | \n", "-0.650371 | \n", "-0.979436 | \n", "0.253413 | \n", "-0.334434 | \n", "-0.404989 | \n", "45.0 | \n", "3.0 | \n", "0.0 | \n", "... | \n", "-0.716150 | \n", "-0.510766 | \n", "-0.357277 | \n", "-0.160491 | \n", "-0.354582 | \n", "-0.778657 | \n", "-0.406361 | \n", "-0.111897 | \n", "-0.808157 | \n", "0.574534 | \n", "
410772 | \n", "-1.936771 | \n", "-0.688830 | \n", "0.359235 | \n", "0.855717 | \n", "-0.669085 | \n", "-0.456947 | \n", "-0.142544 | \n", "18.0 | \n", "0.0 | \n", "1.0 | \n", "... | \n", "-0.454880 | \n", "-0.581851 | \n", "-0.603122 | \n", "-0.211576 | \n", "-0.487813 | \n", "0.173638 | \n", "0.170166 | \n", "-0.140800 | \n", "-0.798234 | \n", "-1.663294 | \n", "
410773 | \n", "-0.748366 | \n", "-0.804077 | \n", "-0.650371 | \n", "0.702788 | \n", "0.253413 | \n", "1.365441 | \n", "1.501833 | \n", "52.0 | \n", "3.0 | \n", "0.0 | \n", "... | \n", "2.828890 | \n", "1.431272 | \n", "1.729715 | \n", "-0.160491 | \n", "1.581436 | \n", "2.432437 | \n", "-0.843150 | \n", "0.411475 | \n", "1.671069 | \n", "-1.052343 | \n", "
410774 | \n", "1.257769 | \n", "-1.101815 | \n", "-0.044608 | \n", "0.091070 | \n", "1.175911 | \n", "0.553789 | \n", "-0.142544 | \n", "14.0 | \n", "1.0 | \n", "0.0 | \n", "... | \n", "0.202136 | \n", "-0.916054 | \n", "0.940624 | \n", "1.217910 | \n", "0.311575 | \n", "-0.174762 | \n", "-0.415359 | \n", "0.135617 | \n", "0.359680 | \n", "-0.246790 | \n", "
410775 rows × 36 columns
\n", "\n", " | num__geo_lat | \n", "num__geo_lon | \n", "afr__geo_lon | \n", "afr__area*kitchen_area | \n", "afr__sqrt(area)*geo_lat | \n", "afr__sqrt(area)*log(level) | \n", "afr__kitchen_area*log(level) | \n", "afr__sqrt(area)*sqrt(kitchen_area) | \n", "afr__rooms*log(level) | \n", "afr__kitchen_area*rooms | \n", "afr__sqrt(geo_lon)*sqrt(level) | \n", "afr__geo_lat*log(geo_lon) | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0.495902 | \n", "-0.449742 | \n", "-0.449742 | \n", "-0.132188 | \n", "0.373151 | \n", "0.688076 | \n", "0.044178 | \n", "-0.153548 | \n", "0.690329 | \n", "-0.132529 | \n", "0.323880 | \n", "0.068167 | \n", "
1 | \n", "0.177806 | \n", "1.433673 | \n", "1.433673 | \n", "-0.169370 | \n", "0.005114 | \n", "0.071369 | \n", "-0.173647 | \n", "-0.267268 | \n", "0.282625 | \n", "-0.132529 | \n", "0.552794 | \n", "1.129118 | \n", "
2 | \n", "0.440548 | \n", "0.047222 | \n", "0.047222 | \n", "-0.226261 | \n", "-0.425530 | \n", "-0.335537 | \n", "-0.239271 | \n", "-0.454880 | \n", "-0.512211 | \n", "-0.487813 | \n", "-0.243092 | \n", "0.460495 | \n", "
3 | \n", "-1.588818 | \n", "-0.722477 | \n", "-0.722477 | \n", "-0.165302 | \n", "-0.723225 | \n", "0.034116 | \n", "-0.129771 | \n", "-0.254514 | \n", "0.282625 | \n", "-0.088119 | \n", "-0.369355 | \n", "-1.481033 | \n", "
4 | \n", "1.493662 | \n", "1.125819 | \n", "1.125819 | \n", "0.094342 | \n", "1.522265 | \n", "0.862773 | \n", "0.194490 | \n", "0.438600 | \n", "0.574497 | \n", "0.045112 | \n", "0.945981 | \n", "1.923382 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
410770 | \n", "0.592011 | \n", "0.355014 | \n", "0.355014 | \n", "-0.120841 | \n", "0.206926 | \n", "0.714499 | \n", "0.226990 | \n", "-0.120035 | \n", "0.792500 | \n", "-0.037463 | \n", "0.974381 | \n", "0.789871 | \n", "
410771 | \n", "0.240478 | \n", "0.392697 | \n", "0.392697 | \n", "-0.296252 | \n", "-0.297209 | \n", "-0.551021 | \n", "-0.560144 | \n", "-0.716150 | \n", "-0.160491 | \n", "-0.354582 | \n", "-0.406361 | \n", "0.574534 | \n", "
410772 | \n", "-1.936771 | \n", "-0.688830 | \n", "-0.688830 | \n", "-0.226261 | \n", "-1.192706 | \n", "0.306280 | \n", "0.100868 | \n", "-0.454880 | \n", "-0.211576 | \n", "-0.487813 | \n", "0.170166 | \n", "-1.663294 | \n", "
410773 | \n", "-0.748366 | \n", "-0.804077 | \n", "-0.804077 | \n", "1.570163 | \n", "1.274445 | \n", "-0.002521 | \n", "0.745507 | \n", "2.828890 | \n", "-0.160491 | \n", "1.581436 | \n", "-0.843150 | \n", "-1.052343 | \n", "
410774 | \n", "1.257769 | \n", "-1.101815 | \n", "-1.101815 | \n", "-0.002742 | \n", "1.338996 | \n", "0.635065 | \n", "-0.040302 | \n", "0.202136 | \n", "1.217910 | \n", "0.311575 | \n", "-0.415359 | \n", "-0.246790 | \n", "
410775 rows × 12 columns
\n", "\n", " | num__geo_lat | \n", "num__geo_lon | \n", "num__level | \n", "num__rooms | \n", "num__kitchen_area | \n", "cat__region | \n", "quantile__geo_lat | \n", "quantile__geo_lon | \n", "quantile__level | \n", "poly__area kitchen_area | \n", "spline__area_sp_0 | \n", "spline__area_sp_2 | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0.495902 | \n", "-0.449742 | \n", "0.359235 | \n", "0.253413 | \n", "-0.186285 | \n", "20.0 | \n", "0.766257 | \n", "0.511028 | \n", "0.717217 | \n", "-0.132188 | \n", "0.155806 | \n", "0.178013 | \n", "
1 | \n", "0.177806 | \n", "1.433673 | \n", "-0.246529 | \n", "0.253413 | \n", "-0.186285 | \n", "70.0 | \n", "0.297142 | \n", "0.867999 | \n", "0.522022 | \n", "-0.169370 | \n", "0.156921 | \n", "0.176803 | \n", "
2 | \n", "0.440548 | \n", "0.047222 | \n", "-0.448450 | \n", "-0.669085 | \n", "-0.142544 | \n", "15.0 | \n", "0.732330 | \n", "0.629984 | \n", "0.417417 | \n", "-0.226261 | \n", "0.159080 | \n", "0.174488 | \n", "
3 | \n", "-1.588818 | \n", "-0.722477 | \n", "-0.246529 | \n", "0.253413 | \n", "-0.142544 | \n", "18.0 | \n", "0.148789 | \n", "0.295262 | \n", "0.522022 | \n", "-0.165302 | \n", "0.157341 | \n", "0.176349 | \n", "
4 | \n", "1.493662 | \n", "1.125819 | \n", "0.157313 | \n", "0.253413 | \n", "-0.011322 | \n", "10.0 | \n", "0.985937 | \n", "0.758363 | \n", "0.662663 | \n", "0.094342 | \n", "0.152390 | \n", "0.181792 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
410770 | \n", "0.592011 | \n", "0.355014 | \n", "0.561156 | \n", "0.253413 | \n", "-0.092653 | \n", "54.0 | \n", "0.788393 | \n", "0.686728 | \n", "0.771271 | \n", "-0.120841 | \n", "0.156705 | \n", "0.177037 | \n", "
410771 | \n", "0.240478 | \n", "0.392697 | \n", "-0.650371 | \n", "0.253413 | \n", "-0.404989 | \n", "45.0 | \n", "0.494062 | \n", "0.717240 | \n", "0.309810 | \n", "-0.296252 | \n", "0.158306 | \n", "0.175314 | \n", "
410772 | \n", "-1.936771 | \n", "-0.688830 | \n", "0.359235 | \n", "-0.669085 | \n", "-0.142544 | \n", "18.0 | \n", "0.131352 | \n", "0.327613 | \n", "0.717217 | \n", "-0.226261 | \n", "0.159080 | \n", "0.174488 | \n", "
410773 | \n", "-0.748366 | \n", "-0.804077 | \n", "-0.650371 | \n", "0.253413 | \n", "1.501833 | \n", "52.0 | \n", "0.193143 | \n", "0.114753 | \n", "0.309810 | \n", "1.570163 | \n", "0.147820 | \n", "0.187011 | \n", "
410774 | \n", "1.257769 | \n", "-1.101815 | \n", "-0.044608 | \n", "1.175911 | \n", "-0.142544 | \n", "14.0 | \n", "0.908036 | \n", "0.075725 | \n", "0.604605 | \n", "-0.002742 | \n", "0.152767 | \n", "0.181370 | \n", "
410775 rows × 12 columns
\n", "\n", " | num__geo_lon | \n", "quantile__geo_lat | \n", "spline__area_sp_3 | \n", "
---|---|---|---|
0 | \n", "-0.449742 | \n", "0.766257 | \n", "1.826008e-06 | \n", "
1 | \n", "1.433673 | \n", "0.297142 | \n", "1.310449e-06 | \n", "
2 | \n", "0.047222 | \n", "0.732330 | \n", "6.098363e-07 | \n", "
3 | \n", "-0.722477 | \n", "0.148789 | \n", "1.144942e-06 | \n", "
4 | \n", "1.125819 | \n", "0.985937 | \n", "4.240047e-06 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
410770 | \n", "0.355014 | \n", "0.788393 | \n", "1.401454e-06 | \n", "
410771 | \n", "0.392697 | \n", "0.494062 | \n", "8.202272e-07 | \n", "
410772 | \n", "-0.688830 | \n", "0.131352 | \n", "6.098363e-07 | \n", "
410773 | \n", "-0.804077 | \n", "0.193143 | \n", "1.004843e-05 | \n", "
410774 | \n", "-1.101815 | \n", "0.908036 | \n", "3.903343e-06 | \n", "
410775 rows × 3 columns
\n", "