Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.
iis-project/eda/eda.ipynb

164 KiB

import pathlib
import re
import sys
import matplotlib.pyplot
import numpy
import pandas
BASE_PATH = pathlib.Path('..')
CODE_PATH = BASE_PATH
sys.path.insert(0, str(CODE_PATH.resolve()))
import iis_project.pandas_utils
import iis_project.plotting_utils
DATA_PATH = BASE_PATH / 'data'
df_orig = pandas.read_csv(DATA_PATH / 'cars.csv')
df_orig = df_orig.rename(columns=lambda s: re.sub(r'\s', '_', s.lower().replace(' ', '_')))
df_orig.head(0x10)
car_name year selling_price present_price driven_kms fuel_type selling_type transmission owner
0 ritz 2014 3.35 5.59 27000 Petrol Dealer Manual 0
1 sx4 2013 4.75 9.54 43000 Diesel Dealer Manual 0
2 ciaz 2017 7.25 9.85 6900 Petrol Dealer Manual 0
3 wagon r 2011 2.85 4.15 5200 Petrol Dealer Manual 0
4 swift 2014 4.60 6.87 42450 Diesel Dealer Manual 0
5 vitara brezza 2018 9.25 9.83 2071 Diesel Dealer Manual 0
6 ciaz 2015 6.75 8.12 18796 Petrol Dealer Manual 0
7 s cross 2015 6.50 8.61 33429 Diesel Dealer Manual 0
8 ciaz 2016 8.75 8.89 20273 Diesel Dealer Manual 0
9 ciaz 2015 7.45 8.92 42367 Diesel Dealer Manual 0
10 alto 800 2017 2.85 3.60 2135 Petrol Dealer Manual 0
11 ciaz 2015 6.85 10.38 51000 Diesel Dealer Manual 0
12 ciaz 2015 7.50 9.94 15000 Petrol Dealer Automatic 0
13 ertiga 2015 6.10 7.71 26000 Petrol Dealer Manual 0
14 dzire 2009 2.25 7.21 77427 Petrol Dealer Manual 0
15 ertiga 2016 7.75 10.79 43000 Diesel Dealer Manual 0
len(df_orig)
301
iis_project.pandas_utils.describe_df(df_orig)
length dtype
car_name 301 object
year 301 int64
selling_price 301 float64
present_price 301 float64
driven_kms 301 int64
fuel_type 301 object
selling_type 301 object
transmission 301 object
owner 301 int64
numeric_columns_orig = ('selling_price', 'present_price', 'driven_kms')
categorical_columns_orig = ('car_name', 'fuel_type', 'selling_type', 'transmission', 'owner')
df_orig[list(numeric_columns_orig)].describe()
selling_price present_price driven_kms
count 301.000000 301.000000 301.000000
mean 4.661296 7.628472 36947.205980
std 5.082812 8.642584 38886.883882
min 0.100000 0.320000 500.000000
25% 0.900000 1.200000 15000.000000
50% 3.600000 6.400000 32000.000000
75% 6.000000 9.900000 48767.000000
max 35.000000 92.600000 500000.000000
all((len(s) == len(df_orig)) for _, s in df_orig.items())
True
categorical_values_for_columns_orig = {
    column: series.unique()
    for column, series in df_orig[list(categorical_columns_orig)].items()
}

for column, values in categorical_values_for_columns_orig.items():
    if len(values) <= 0x10:
        values_str = ', '.join(map(repr, values))
    else:
        values_str = f'({len(values)} values)'
    print(f'{column!r}: {values_str}')
'car_name': (98 values)
'fuel_type': 'Petrol', 'Diesel', 'CNG'
'selling_type': 'Dealer', 'Individual'
'transmission': 'Manual', 'Automatic'
'owner': np.int64(0), np.int64(1), np.int64(3)
for column, series in df_orig[list(numeric_columns_orig)].items():
    _fig, _ax = matplotlib.pyplot.subplots()
    _ax.set_title(str(column))
    #_ax.set_xscale('symlog')
    _ax.set_yscale('log')
    _ax.grid(True)
    _ = _ax.hist(series, bins=iis_project.plotting_utils.suggest_bins_num(len(series)))

for column, series in df_orig[list(filter(lambda s: s not in ('car_name',), categorical_columns_orig))].items():
    _fig, _ax = matplotlib.pyplot.subplots()
    _ax.set_title(str(column))
    _ax.set_yscale('log')
    _ax.grid(True)
    value_counts = series.value_counts()
    _ = _ax.bar(tuple(map(str, value_counts.index)), value_counts)

labels_to_drop_from_orig = []
df_orig.loc[df_orig['owner'].isin((3,))]
car_name year selling_price present_price driven_kms fuel_type selling_type transmission owner
85 camry 2006 2.5 23.73 142000 Petrol Individual Automatic 3
labels_to_drop_from_orig.extend(df_orig.loc[df_orig['owner'].isin((3,))].index)
df_orig.loc[(df_orig['present_price'] >= 60.) | (df_orig['driven_kms'] >= 400000) | (df_orig['fuel_type'].isin(('CNG',)))]
car_name year selling_price present_price driven_kms fuel_type selling_type transmission owner
18 wagon r 2015 3.25 5.09 35500 CNG Dealer Manual 0
35 sx4 2011 2.95 7.74 49998 CNG Dealer Manual 0
86 land cruiser 2010 35.00 92.60 78000 Diesel Dealer Manual 0
196 Activa 3g 2008 0.17 0.52 500000 Petrol Individual Automatic 0
labels_to_drop_from_orig.extend((196,))
df = df_orig.drop(labels_to_drop_from_orig)
len(df)
299
iis_project.pandas_utils.describe_df(df)
length dtype
car_name 299 object
year 299 int64
selling_price 299 float64
present_price 299 float64
driven_kms 299 int64
fuel_type 299 object
selling_type 299 object
transmission 299 object
owner 299 int64
df[list(numeric_columns_orig)].describe()
selling_price present_price driven_kms
count 299.000000 299.000000 299.000000
mean 4.683545 7.598395 35047.187291
std 5.091611 8.611335 27607.236346
min 0.100000 0.320000 500.000000
25% 0.900000 1.230000 15000.000000
50% 3.650000 6.400000 32000.000000
75% 6.000000 9.875000 47500.000000
max 35.000000 92.600000 213000.000000
categorical_values_for_columns = {
    column: series.unique()
    for column, series in df[list(categorical_columns_orig)].items()
}

for column, values in categorical_values_for_columns.items():
    if len(values) <= 0x10:
        values_str = ', '.join(map(repr, values))
    else:
        values_str = f'({len(values)} values)'
    print(f'{column!r}: {values_str}')
'car_name': (97 values)
'fuel_type': 'Petrol', 'Diesel', 'CNG'
'selling_type': 'Dealer', 'Individual'
'transmission': 'Manual', 'Automatic'
'owner': np.int64(0), np.int64(1)
for column, series in df_orig[['present_price', 'selling_price']].items():
    _fig, _ax = matplotlib.pyplot.subplots()
    _ax.set_title(str(column))
    _ax.set_xscale('symlog')
    _ax.set_yscale('log')
    _ax.grid(True)
    _ = _ax.hist(series, bins=numpy.logspace(
        numpy.log10(min(series)), numpy.log10(max(series)), (iis_project.plotting_utils.suggest_bins_num(len(series)) + 1), endpoint=True, base=10),
    )
    _ = _ax.set_xlim((0, None))

_fig, _ax = matplotlib.pyplot.subplots()
_ax.set_xscale('symlog')
_ax.set_yscale('symlog')
_ax.grid(True)
_ = _ax.scatter(df['selling_price'], df['present_price'])