# ---
# jupyter:
#   jupytext:
#     formats: ipynb,py:percent
#     text_representation:
#       extension: .py
#       format_name: percent
#       format_version: '1.3'
#       jupytext_version: 1.17.3
#   kernelspec:
#     display_name: .venv
#     language: python
#     name: python3
# ---

# %%
# Provenance: this source was added as research/research.py by commit
# 41497aa039368d05f2475de8650220670a8281ac (author syropiatovvv, 2025-10-15);
# the commit subject translates to "research notebook".
from typing import Optional

# %%
import os
import pathlib
import pickle

# %%
import sklearn.compose
import sklearn.ensemble
import sklearn.metrics
import sklearn.model_selection
import sklearn.pipeline
import sklearn.preprocessing

# %%
# %% tags=["parameters"]
# Notebook parameters. The cell is tagged "parameters"; presumably the values
# are meant to be overridden by a papermill-style runner -- TODO confirm.
data_path: Optional[str] = None
# Full path to the (CSV) file with the source dataset. If not set, the file is looked up in `data/`.
# NOTE(review): in the visible notebook only the directory part of this path is used (to locate the data directory).
data_relpath: str = 'cars.csv'
# Path to the (CSV) file with the source dataset, relative to the data directory `data`. Ignored if data_path is set.
# NOTE(review): this parameter is not referenced anywhere in the visible part of the notebook.

data_aug_pickle_path: Optional[str] = None
# Full path to the (pickle) file for saving the cleaned dataset. If not set, `data/` is used.
# NOTE(review): the wording says "saving", but this notebook opens the file for reading ('rb' + pickle.load).
data_aug_pickle_relpath: str = 'cars.aug.pickle'
# Path to the (pickle) file for saving the cleaned dataset, relative to the data directory `data`. Ignored if data_aug_pickle_path is set.

# %%
# Paths are resolved against the parent of the current working directory.
BASE_PATH = pathlib.Path('..')

# %%
# Data directory: the folder that contains `data_path` when that parameter is
# given, otherwise `<BASE_PATH>/data`.
if data_path is None:
    DATA_PATH = BASE_PATH / 'data'
else:
    DATA_PATH = pathlib.Path(os.path.dirname(data_path))

# %%
# An explicit `data_aug_pickle_path` takes precedence; otherwise the pickle is
# looked up inside DATA_PATH under `data_aug_pickle_relpath`.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open(
    (
        (DATA_PATH / data_aug_pickle_relpath)
        if data_aug_pickle_path is None
        else data_aug_pickle_path
    ),
    'rb',
) as input_file:
    df_orig = pickle.load(input_file)

# %%
# Quick look at the first 16 rows.
df_orig.head(16)

# %%
len(df_orig)

# %%
df_orig.info()

# %%
# Model inputs; 'owner' is currently disabled.
feature_columns = (
    'selling_price',
    'driven_kms',
    'fuel_type',
    'selling_type',
    'transmission',
    # 'owner',
    'age',
)

# Column to predict.
target_columns = ('present_price',)

# %%
# Features that get standardised; every one of them must have a numeric dtype.
features_to_scale_to_standard_columns = (
    'selling_price',
    'driven_kms',
    'age',
)
assert set(features_to_scale_to_standard_columns).issubset(
    df_orig.select_dtypes(('number',)).columns
)

# Categorical features; every one of them must be of category/object dtype.
# NOTE: despite the variable name, these columns are target-encoded in the
# transformer below, not one-hot encoded. 'owner' is currently disabled.
features_to_encode_one_hot_columns = (
    'fuel_type',
    'selling_type',
    'transmission',
    # 'owner',
)
assert set(features_to_encode_one_hot_columns).issubset(
    df_orig.select_dtypes(('category', 'object')).columns
)

# %%
df_orig_features = df_orig[[*feature_columns]]
df_target = df_orig[[*target_columns]]

# %%
# Share of the rows held out for testing.
DF_TEST_PORTION = 0.25

# %%
# Reproducible train/test split (fixed random seed).
(
    df_orig_features_train,
    df_orig_features_test,
    df_target_train,
    df_target_test,
) = sklearn.model_selection.train_test_split(
    df_orig_features,
    df_target,
    test_size=DF_TEST_PORTION,
    random_state=0x7AE6,
)

# %%
# Sizes of the train and test parts.
(len(df_target_train), len(df_target_test))

# %%
# Column-wise preprocessing: standard scaling for the numeric features and
# target encoding for the categorical ones; any other column is dropped.
# Disabled alternative for the second step: the name
# 'encode_categoricals_one_hot' with sklearn.preprocessing.OneHotEncoder().
preprocess_transformer = sklearn.compose.ColumnTransformer(
    transformers=[
        (
            'scale_to_standard',
            sklearn.preprocessing.StandardScaler(),
            features_to_scale_to_standard_columns,
        ),
        (
            'encode_categoricals_wrt_target',
            sklearn.preprocessing.TargetEncoder(
                target_type='continuous',
                smooth='auto',
                cv=3,
                shuffle=True,
                random_state=0x2ED6,
            ),
            features_to_encode_one_hot_columns,
        ),
    ],
    remainder='drop',
)

# %%
# Small random forest (10 trees) with a fixed seed for reproducibility.
regressor = sklearn.ensemble.RandomForestRegressor(
    n_estimators=10,
    criterion='squared_error',
    max_features='sqrt',
    random_state=0x016B,
)

# %%
# Full model: preprocessing followed by the regressor.
pipeline = sklearn.pipeline.Pipeline(
    steps=[
        ('preprocess', preprocess_transformer),
        ('regress', regressor),
    ],
)

# %%
pipeline

# %%
# The target is passed as a 1-D series (first and only target column);
# assigning to `_` keeps the fitted estimator from being echoed as cell output.
_ = pipeline.fit(df_orig_features_train, df_target_train.iloc[:, 0])

# %%
target_test_predicted = pipeline.predict(df_orig_features_test)

# %%
# Error metrics on the held-out test part, keyed by short metric name.
metrics = {
    name: score(df_target_test, target_test_predicted)
    for name, score in (
        ('mse', sklearn.metrics.mean_squared_error),
        ('mae', sklearn.metrics.mean_absolute_error),
        ('mape', sklearn.metrics.mean_absolute_percentage_error),
    )
}

# %%
metrics