@ -39,13 +39,15 @@ mlflow_registry_uri: Optional[str] = None
# URL сервера registry MLFlow (если не указан, используется `mlflow_tracking_server_uri`).
mlflow_do_log : bool = False
# Записывать ли прогон (run) в MLFlow; если True, при каждом исполнении блокнота создаётся новый прогон с именем `mlflow_run_name` .
# Записывать ли прогоны (runs) в MLFlow.
mlflow_experiment_id : Optional [ str ] = None
# ID эксперимента MLFlow, имеет приоритет над `mlflow_experiment_name`.
mlflow_experiment_name : Optional [ str ] = ' Current price predicion for used cars '
# Имя эксперимента MLFlow (имеет более низкий приоритет, чем `mlflow_experiment_id`).
mlflow_run_name : str = ' Baseline model '
# Имя нового прогона MLFlow (используется для создания нового прогона, если `mlflow_do_log` установлен в True).
mlflow_baseline_run_name : str = ' Baseline model '
# Имя нового прогона MLFlow для baseline модели.
mlflow_feateng_run_name : str = ' Model with engineered features '
# Имя нового прогона MLFlow для модели, использующей дополнительные признаки.
# %%
import os
@ -75,7 +77,7 @@ sys.path.insert(0, str(CODE_PATH.resolve()))
from iis_project . sklearn_utils import filter_params
from iis_project . sklearn_utils . compose import COLUMN_TRANSFORMER_PARAMS_COMMON_INCLUDE
from iis_project . sklearn_utils . ensemble import RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE
from iis_project . sklearn_utils . p ipeline import PIPELINE_PARAMS_COMMON_INCLUDE
from iis_project . sklearn_utils . p andas import pandas_dataframe_from_transformed_artifacts
from iis_project . sklearn_utils . preprocessing import STANDARD_SCALER_PARAMS_COMMON_EXCLUDE
# %%
@ -112,23 +114,17 @@ with open(
df_orig = pickle . load ( input_file )
# %% [markdown]
# Обзор строк датасета:
# %%
df_orig . head ( 0x10 )
# %% [markdown]
# Размер датасета:
# Обзор датасета:
# %%
len ( df_orig )
# %% [markdown]
# Количество непустых значений и тип каждого столбца:
# %%
df_orig . info ( )
# %%
df_orig . head ( 0x10 )
# %% [markdown]
# ## Разделение датасета на выборки
@ -196,7 +192,7 @@ df_orig_features_train, df_orig_features_test, df_target_train, df_target_test =
tuple ( map ( len , ( df_target_train , df_target_test ) ) )
# %% [markdown]
# ## Создание пайплайнов обработки признаков и обучения м одели
# ## Модели
# %%
#MODEL_PIP_REQUIREMENTS_PATH = BASE_PATH / 'requirements' / 'requirements-isolated-research-model.txt'
@ -208,6 +204,7 @@ tuple(map(len, (df_target_train, df_target_test)))
mlflow_model_signature = mlflow . models . infer_signature ( model_input = df_orig_features , model_output = df_target )
mlflow_model_signature
# %% [raw] vscode={"languageId": "raw"}
# input_schema = mlflow.types.schema.Schema([
# mlflow.types.schema.ColSpec("double", "selling_price"),
@ -224,33 +221,230 @@ mlflow_model_signature
#
# mlflow_model_signature = mlflow.models.ModelSignature(inputs=input_schema, outputs=output_schema)
# %%
def build_features_scaler_standard():
    """Create a new, unfitted ``StandardScaler`` with default settings.

    Returns:
        A fresh ``sklearn.preprocessing.StandardScaler`` instance; every call
        produces an independent object.
    """
    scaler = sklearn.preprocessing.StandardScaler()
    return scaler
# %%
#def build_categorical_features_encoder_onehot():
# return sklearn.preprocessing.OneHotEncoder()
def build_categorical_features_encoder_target(*, random_state=None):
    """Create a new, unfitted target encoder for a continuous target.

    Args:
        random_state: Forwarded unchanged to ``TargetEncoder``. Shuffling is
            enabled (``shuffle=True``), so pass a fixed value when the
            encoding has to be reproducible.

    Returns:
        A fresh ``sklearn.preprocessing.TargetEncoder`` instance.
    """
    # The option strings have to match scikit-learn's accepted values exactly:
    # parameter validation rejects values with surrounding whitespace.
    return sklearn.preprocessing.TargetEncoder(
        target_type='continuous',
        smooth='auto',
        shuffle=True,
        random_state=random_state,
    )
# %% [markdown]
# Регрессор — небольшой случайный лес, цель — минимизация квадрата ошибки предсказания:
# %%
def build_regressor(*, random_state=None):
    """Create a new, unfitted random forest regressor with 10 trees.

    The forest is configured with the squared-error split criterion and
    ``max_features='sqrt'``.

    Args:
        random_state: Forwarded unchanged to ``RandomForestRegressor``; pass a
            fixed value to make training reproducible.

    Returns:
        A fresh ``sklearn.ensemble.RandomForestRegressor`` instance.
    """
    # The option strings have to match scikit-learn's accepted values exactly:
    # parameter validation rejects values with surrounding whitespace.
    return sklearn.ensemble.RandomForestRegressor(
        n_estimators=10,
        criterion='squared_error',
        max_features='sqrt',
        random_state=random_state,
    )
# %%
def score_predictions(target_test, target_test_predicted):
    """Compute MSE, MAE and MAPE for a set of predictions.

    Args:
        target_test: True target values.
        target_test_predicted: Predicted target values.

    Returns:
        A dict mapping each metric name to the value returned by the matching
        ``sklearn.metrics`` function, in the order MSE, MAE, MAPE.
    """
    scorers = (
        (' mse ', sklearn.metrics.mean_squared_error),
        (' mae ', sklearn.metrics.mean_absolute_error),
        (' mape ', sklearn.metrics.mean_absolute_percentage_error),
    )
    return {
        metric_name: scorer(target_test, target_test_predicted)
        for metric_name, scorer in scorers
    }
# %%
# Relies on the notebook-level globals `mlflow_do_log` and `mlflow_experiment`.
def mlflow_log_model(
    model,
    model_params,
    metrics,
    *,
    run_name,
    model_signature=None,
    input_example=None,
    # pip_requirements=None,
    comment_file_path=None,
):
    """Log a scikit-learn model with its parameters and metrics as one MLflow run.

    Does nothing when the global ``mlflow_do_log`` is falsy. Otherwise a run
    named ``run_name`` is started in the experiment identified by the global
    ``mlflow_experiment`` and the model is logged through
    ``mlflow.sklearn.log_model``.

    Args:
        model: Model object handed to ``mlflow.sklearn.log_model``.
        model_params: Parameters for ``mlflow.log_params``; skipped when None.
        metrics: Metrics for ``mlflow.log_metrics``; skipped when None.
        run_name: Name given to the new MLflow run.
        model_signature: Passed as ``signature`` when logging the model.
        input_example: Passed as ``input_example`` when logging the model.
        comment_file_path: Optional path object; the file is attached to the
            run as an artifact only if it is not None and ``exists()`` is true.

    Returns:
        None.
    """
    if mlflow_do_log:
        active_run = mlflow.start_run(
            experiment_id=mlflow_experiment.experiment_id,
            run_name=run_name,
        )
        with active_run:
            mlflow.sklearn.log_model(
                model,
                ' model ',
                signature=model_signature,
                input_example=input_example,
                # pip_requirements=pip_requirements,
            )
            if model_params is not None:
                mlflow.log_params(model_params)
            if metrics is not None:
                mlflow.log_metrics(metrics)
            # The comment file is optional: attach it only when a path was
            # given and the file is actually present.
            if comment_file_path is not None:
                if comment_file_path.exists():
                    mlflow.log_artifact(str(comment_file_path))
# %% [markdown]
# ### Baseline модель
# %% [markdown]
# Пайплайн предобработки признаков:
# %%
preprocess_transformer = sklearn . compose . ColumnTransformer (
[
( ' scale_to_standard ' , sklearn . preprocessing . StandardScaler ( ) , features_to_scale_to_standard_columns ) ,
( ' scale_to_standard ' , build_features_scaler_standard ( ) , features_to_scale_to_standard_columns ) ,
(
#'encode_categoricals_one_hot',
' encode_categoricals_wrt_target ' ,
#sklearn.preprocessing.OneHotEncoder(),
sklearn . preprocessing . TargetEncoder (
target_type = ' continuous ' , smooth = ' auto ' , shuffle = True , random_state = 0x2ED6 ,
) ,
#build_categorical_features_encoder_onehot(),
build_categorical_features_encoder_target ( random_state = 0x2ED6 ) ,
features_to_encode_wrt_target_columns ,
) ,
] ,
remainder = ' drop ' ,
)
# %%
regressor = build_regressor ( random_state = 0x016B )
regressor
# %% [markdown]
# Регрессор — небольшой случайный лес, цель — минимизация квадрата ошибки предсказания:
# Составной пайплайн:
# %%
pipeline = sklearn . pipeline . Pipeline ( [
( ' preprocess ' , preprocess_transformer ) ,
( ' regress ' , regressor ) ,
] )
pipeline
# %%
# Select the subset of the baseline pipeline's parameters that is kept in
# `model_params` (later passed to `mlflow_log_model`).
# NOTE(review): the nested include/exclude keys appear to be matched against
# pipeline step / transformer names — confirm against `filter_params`.
model_params = filter_params(
    pipeline.get_params(),
    include={
        ' preprocess ': (
            False,
            {
                **{k: True for k in COLUMN_TRANSFORMER_PARAMS_COMMON_INCLUDE},
                ' scale_to_standard ': True,
                # Spelled exactly like the transformer name registered in the
                # ColumnTransformer ("categoricals", plural); the previous
                # singular spelling matched no transformer.
                ' encode_categoricals_wrt_target ': True,
            },
        ),
        ' regress ': (False, True),
    },
    exclude={
        ' preprocess ': {' scale_to_standard ': STANDARD_SCALER_PARAMS_COMMON_EXCLUDE},
        ' regress ': RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE,
    },
)
model_params
# %% [markdown]
# Обучение модели:
# %%
_ = pipeline . fit ( df_orig_features_train , df_target_train . iloc [ : , 0 ] )
# %% [markdown]
# Оценка качества:
# %%
target_test_predicted = pipeline . predict ( df_orig_features_test )
# %% [markdown]
# Метрики качества (MAPE, а также MSE, MAE):
# %%
regressor = sklearn . ensemble . RandomForestRegressor (
10 , criterion = ' squared_error ' , max_features = ' sqrt ' , random_state = 0x016B ,
metrics = score_predictions ( df_target_test , target_test_predicted )
metrics
# %%
mlflow_log_model (
pipeline ,
model_params = model_params ,
metrics = { k : float ( v ) for k , v in metrics . items ( ) } ,
run_name = mlflow_baseline_run_name ,
model_signature = mlflow_model_signature ,
input_example = df_orig_features . head ( MODEL_INOUT_EXAMPLE_SIZE ) ,
#pip_requirements=str(MODEL_PIP_REQUIREMENTS_PATH),
comment_file_path = (
model_comment_path
if model_comment_path is not None
else ( BASE_PATH / ' research ' / model_comment_relpath )
) ,
)
# %% [markdown]
# ### Модель с дополнительными признаками
# %% [markdown]
# Пайплайн предобработки признаков:
# %%
features_to_extend_as_polynomial = ( ' selling_price ' , ' driven_kms ' )
features_to_extend_as_spline = ( ' age ' , )
# %%
assert set ( features_to_extend_as_polynomial ) < = { * features_to_scale_to_standard_columns }
assert set ( features_to_extend_as_spline ) < = { * features_to_scale_to_standard_columns }
preprocess_transformer = sklearn . compose . ColumnTransformer (
[
(
' extend_features_as_polynomial ' ,
sklearn . pipeline . Pipeline ( [
(
' extend_features ' ,
sklearn . preprocessing . PolynomialFeatures ( 2 , include_bias = False ) ,
) ,
( ' scale_to_standard ' , build_features_scaler_standard ( ) ) ,
] ) ,
features_to_extend_as_polynomial ,
) ,
(
' extend_features_as_spline ' ,
sklearn . preprocessing . SplineTransformer (
4 , knots = ' quantile ' , extrapolation = ' constant ' , include_bias = False ,
) ,
features_to_extend_as_spline ,
) ,
(
' scale_to_standard ' ,
build_features_scaler_standard ( ) ,
tuple ( filter ( lambda f : f not in features_to_extend_as_polynomial , features_to_scale_to_standard_columns ) ) ,
) ,
(
' encode_categoricals_wrt_target ' ,
build_categorical_features_encoder_target ( random_state = 0x2ED6 ) ,
features_to_encode_wrt_target_columns ,
) ,
] ,
remainder = ' drop ' ,
)
preprocess_transformer
# %% [markdown]
# Демонстрация предобработки данных:
# %%
df_tfd_features_matrix_test = preprocess_transformer . fit_transform ( df_orig_features_test , df_target_test . iloc [ : , 0 ] )
df_tfd_features_test = pandas_dataframe_from_transformed_artifacts ( df_tfd_features_matrix_test , preprocess_transformer )
# %% [markdown]
# Обзор предобработанного датасета:
# %%
df_tfd_features_test . info ( )
# %%
df_tfd_features_test . head ( 0x8 )
# %%
regressor = build_regressor ( random_state = 0x3AEF )
regressor
# %% [markdown]
# Составной пайплайн:
@ -266,11 +460,15 @@ pipeline
model_params = filter_params (
pipeline . get_params ( ) ,
include = {
* * { k : True for k in PIPELINE_PARAMS_COMMON_INCLUDE } ,
' preprocess ' : (
False ,
{
* * { k : True for k in COLUMN_TRANSFORMER_PARAMS_COMMON_INCLUDE } ,
' extend_features_as_polynomial ' : {
' extend_features ' : True ,
' scale_to_standard ' : True ,
} ,
' extend_features_as_spline ' : True ,
' scale_to_standard ' : True ,
' encode_categorical_wrt_target ' : True ,
} ,
@ -278,18 +476,26 @@ model_params = filter_params(
' regress ' : ( False , True ) ,
} ,
exclude = {
' preprocess ' : { ' scale_to_standard ' : STANDARD_SCALER_PARAMS_COMMON_EXCLUDE } ,
' preprocess ' : {
' extend_features_as_polynomial ' : {
' scale_to_standard ' : STANDARD_SCALER_PARAMS_COMMON_EXCLUDE ,
} ,
' scale_to_standard ' : STANDARD_SCALER_PARAMS_COMMON_EXCLUDE ,
} ,
' regress ' : RANDOM_FOREST_REGRESSOR_PARAMS_COMMON_EXCLUDE ,
} ,
)
model_params
# %% [markdown]
# ## Baseline модель
# Обучение модели:
# %%
_ = pipeline . fit ( df_orig_features_train , df_target_train . iloc [ : , 0 ] )
# %% [markdown]
# Оценка качества:
# %%
target_test_predicted = pipeline . predict ( df_orig_features_test )
@ -297,31 +503,23 @@ target_test_predicted = pipeline.predict(df_orig_features_test)
# Метрики качества (MAPE, а также MSE, MAE):
# %%
metrics = {
' mse ' : sklearn . metrics . mean_squared_error ( df_target_test , target_test_predicted ) ,
' mae ' : sklearn . metrics . mean_absolute_error ( df_target_test , target_test_predicted ) ,
' mape ' : sklearn . metrics . mean_absolute_percentage_error ( df_target_test , target_test_predicted ) ,
}
metrics = score_predictions ( df_target_test , target_test_predicted )
metrics
# %%
metrics
mlflow_log_model (
pipeline ,
model_params = model_params ,
metrics = { k : float ( v ) for k , v in metrics . items ( ) } ,
run_name = mlflow_feateng_run_name ,
model_signature = mlflow_model_signature ,
input_example = df_orig_features . head ( MODEL_INOUT_EXAMPLE_SIZE ) ,
#pip_requirements=str(MODEL_PIP_REQUIREMENTS_PATH),
comment_file_path = (
model_comment_path
if model_comment_path is not None
else ( BASE_PATH / ' research ' / model_comment_relpath )
) ,
)
# %%
if mlflow_do_log :
with mlflow . start_run ( experiment_id = mlflow_experiment . experiment_id , run_name = mlflow_run_name ) :
_ = mlflow . sklearn . log_model (
pipeline ,
' model ' ,
signature = mlflow_model_signature ,
input_example = df_orig_features . head ( MODEL_INOUT_EXAMPLE_SIZE ) ,
#pip_requirements=str(MODEL_PIP_REQUIREMENTS_PATH),
)
_ = mlflow . log_params ( model_params )
_ = mlflow . log_metrics ( { k : float ( v ) for k , v in metrics . items ( ) } )
comment_file_path = (
model_comment_path
if model_comment_path is not None
else ( BASE_PATH / ' research ' / model_comment_relpath )
)
if comment_file_path . exists ( ) :
mlflow . log_artifact ( str ( comment_file_path ) )