{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pathlib\n", "import re\n", "import sys" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot\n", "import numpy\n", "import pandas" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Every path below is built relative to the parent of the working directory.\n", "BASE_PATH = pathlib.Path('..')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Put the code directory first on sys.path so the project imports in the next cell resolve.\n", "CODE_PATH = BASE_PATH\n", "sys.path.insert(0, str(CODE_PATH.resolve()))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import iis_project.pandas_utils\n", "import iis_project.plotting_utils" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "# Directory the input CSV is read from.\n", "DATA_PATH = BASE_PATH / 'data'" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Load the raw table and normalise its column labels:\n", "# lower-case them and turn every whitespace character into an underscore.\n", "df_orig = pandas.read_csv(DATA_PATH / 'cars.csv').rename(\n", "    columns=lambda label: re.sub(r'\\s', '_', label.lower()),\n", ")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
car_nameyearselling_pricepresent_pricedriven_kmsfuel_typeselling_typetransmissionowner
0ritz20143.355.5927000PetrolDealerManual0
1sx420134.759.5443000DieselDealerManual0
2ciaz20177.259.856900PetrolDealerManual0
3wagon r20112.854.155200PetrolDealerManual0
4swift20144.606.8742450DieselDealerManual0
5vitara brezza20189.259.832071DieselDealerManual0
6ciaz20156.758.1218796PetrolDealerManual0
7s cross20156.508.6133429DieselDealerManual0
8ciaz20168.758.8920273DieselDealerManual0
9ciaz20157.458.9242367DieselDealerManual0
10alto 80020172.853.602135PetrolDealerManual0
11ciaz20156.8510.3851000DieselDealerManual0
12ciaz20157.509.9415000PetrolDealerAutomatic0
13ertiga20156.107.7126000PetrolDealerManual0
14dzire20092.257.2177427PetrolDealerManual0
15ertiga20167.7510.7943000DieselDealerManual0
\n", "
" ], "text/plain": [ " car_name year selling_price present_price driven_kms fuel_type \\\n", "0 ritz 2014 3.35 5.59 27000 Petrol \n", "1 sx4 2013 4.75 9.54 43000 Diesel \n", "2 ciaz 2017 7.25 9.85 6900 Petrol \n", "3 wagon r 2011 2.85 4.15 5200 Petrol \n", "4 swift 2014 4.60 6.87 42450 Diesel \n", "5 vitara brezza 2018 9.25 9.83 2071 Diesel \n", "6 ciaz 2015 6.75 8.12 18796 Petrol \n", "7 s cross 2015 6.50 8.61 33429 Diesel \n", "8 ciaz 2016 8.75 8.89 20273 Diesel \n", "9 ciaz 2015 7.45 8.92 42367 Diesel \n", "10 alto 800 2017 2.85 3.60 2135 Petrol \n", "11 ciaz 2015 6.85 10.38 51000 Diesel \n", "12 ciaz 2015 7.50 9.94 15000 Petrol \n", "13 ertiga 2015 6.10 7.71 26000 Petrol \n", "14 dzire 2009 2.25 7.21 77427 Petrol \n", "15 ertiga 2016 7.75 10.79 43000 Diesel \n", "\n", " selling_type transmission owner \n", "0 Dealer Manual 0 \n", "1 Dealer Manual 0 \n", "2 Dealer Manual 0 \n", "3 Dealer Manual 0 \n", "4 Dealer Manual 0 \n", "5 Dealer Manual 0 \n", "6 Dealer Manual 0 \n", "7 Dealer Manual 0 \n", "8 Dealer Manual 0 \n", "9 Dealer Manual 0 \n", "10 Dealer Manual 0 \n", "11 Dealer Manual 0 \n", "12 Dealer Automatic 0 \n", "13 Dealer Manual 0 \n", "14 Dealer Manual 0 \n", "15 Dealer Manual 0 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Preview the first 16 rows of the raw table.\n", "df_orig.head(16)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "301" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of rows in the raw table.\n", "len(df_orig.index)" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lengthdtype
car_name301object
year301int64
selling_price301float64
present_price301float64
driven_kms301int64
fuel_type301object
selling_type301object
transmission301object
owner301int64
\n", "
" ], "text/plain": [ " length dtype\n", "car_name 301 object\n", "year 301 int64\n", "selling_price 301 float64\n", "present_price 301 float64\n", "driven_kms 301 int64\n", "fuel_type 301 object\n", "selling_type 301 object\n", "transmission 301 object\n", "owner 301 int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Per-column overview of the raw table, produced by a project helper.\n", "iis_project.pandas_utils.describe_df(df_orig)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Columns summarised with describe() and drawn as histograms below.\n", "numeric_columns_orig = (\n", "    'selling_price',\n", "    'present_price',\n", "    'driven_kms',\n", ")\n", "# Columns inspected through their distinct values; 'year' appears in neither group.\n", "categorical_columns_orig = (\n", "    'car_name',\n", "    'fuel_type',\n", "    'selling_type',\n", "    'transmission',\n", "    'owner',\n", ")" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
selling_pricepresent_pricedriven_kms
count301.000000301.000000301.000000
mean4.6612967.62847236947.205980
std5.0828128.64258438886.883882
min0.1000000.320000500.000000
25%0.9000001.20000015000.000000
50%3.6000006.40000032000.000000
75%6.0000009.90000048767.000000
max35.00000092.600000500000.000000
\n", "
" ], "text/plain": [ " selling_price present_price driven_kms\n", "count 301.000000 301.000000 301.000000\n", "mean 4.661296 7.628472 36947.205980\n", "std 5.082812 8.642584 38886.883882\n", "min 0.100000 0.320000 500.000000\n", "25% 0.900000 1.200000 15000.000000\n", "50% 3.600000 6.400000 32000.000000\n", "75% 6.000000 9.900000 48767.000000\n", "max 35.000000 92.600000 500000.000000" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Summary statistics of the numeric columns of the raw table.\n", "df_orig.loc[:, list(numeric_columns_orig)].describe()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "True" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Check that every column is as long as the table itself.\n", "# NOTE(review): a DataFrame column always has the frame's length, so this looks\n", "# always-True; if the goal was to detect missing values, confirm and use a notna()-based check.\n", "all(len(column_values) == len(df_orig) for _name, column_values in df_orig.items())" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "'car_name': (98 values)\n", "'fuel_type': 'Petrol', 'Diesel', 'CNG'\n", "'selling_type': 'Dealer', 'Individual'\n", "'transmission': 'Manual', 'Automatic'\n", "'owner': np.int64(0), np.int64(1), np.int64(3)\n" ] } ], "source": [ "# Collect the distinct values of every categorical column of the raw table.\n", "categorical_values_for_columns_orig = {\n", "    column_name: column_values.unique()\n", "    for column_name, column_values in df_orig[list(categorical_columns_orig)].items()\n", "}\n", "\n", "# Spell the values out for small sets (up to 16); otherwise report only how many there are.\n", "for column_name, distinct_values in categorical_values_for_columns_orig.items():\n", "    values_str = (\n", "        ', '.join(repr(value) for value in distinct_values)\n", "        if len(distinct_values) <= 16\n", "        else f'({len(distinct_values)} values)'\n", "    )\n", "    print(f'{column_name!r}: {values_str}')" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# One histogram per numeric column of the raw table, with log-scaled counts.\n", "for column_name in numeric_columns_orig:\n", "    column_values = df_orig[column_name]\n", "    _fig, _ax = matplotlib.pyplot.subplots()\n", "    _ax.set_title(str(column_name))\n", "    _ax.set_yscale('log')\n", "    _ax.grid(True)\n", "    # The number of bins comes from a project helper and depends on the sample size.\n", "    bins_num = iis_project.plotting_utils.suggest_bins_num(len(column_values))\n", "    _ = _ax.hist(column_values, bins=bins_num)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Bar chart of value frequencies (log-scaled) for every categorical column except 'car_name'.\n", "plotted_columns = [name for name in categorical_columns_orig if name != 'car_name']\n", "for column_name, column_values in df_orig[plotted_columns].items():\n", "    _fig, _ax = matplotlib.pyplot.subplots()\n", "    _ax.set_title(str(column_name))\n", "    _ax.set_yscale('log')\n", "    _ax.grid(True)\n", "    value_counts = column_values.value_counts()\n", "    # Bar positions are the values rendered as strings, so they are treated as categories.\n", "    bar_labels = tuple(str(value) for value in value_counts.index)\n", "    _ = _ax.bar(bar_labels, value_counts)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# Index labels of raw-table rows to exclude; filled in by the cells below.\n", "labels_to_drop_from_orig = list()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
car_nameyearselling_pricepresent_pricedriven_kmsfuel_typeselling_typetransmissionowner
85camry20062.523.73142000PetrolIndividualAutomatic3
\n", "
" ], "text/plain": [ " car_name year selling_price present_price driven_kms fuel_type \\\n", "85 camry 2006 2.5 23.73 142000 Petrol \n", "\n", " selling_type transmission owner \n", "85 Individual Automatic 3 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rows whose 'owner' value is 3.\n", "df_orig[df_orig['owner'].isin([3])]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# Schedule every row whose 'owner' value is 3 for removal.\n", "labels_to_drop_from_orig.extend(df_orig.index[df_orig['owner'].isin([3])])" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
car_nameyearselling_pricepresent_pricedriven_kmsfuel_typeselling_typetransmissionowner
18wagon r20153.255.0935500CNGDealerManual0
35sx420112.957.7449998CNGDealerManual0
86land cruiser201035.0092.6078000DieselDealerManual0
196Activa 3g20080.170.52500000PetrolIndividualAutomatic0
\n", "
" ], "text/plain": [ " car_name year selling_price present_price driven_kms fuel_type \\\n", "18 wagon r 2015 3.25 5.09 35500 CNG \n", "35 sx4 2011 2.95 7.74 49998 CNG \n", "86 land cruiser 2010 35.00 92.60 78000 Diesel \n", "196 Activa 3g 2008 0.17 0.52 500000 Petrol \n", "\n", " selling_type transmission owner \n", "18 Dealer Manual 0 \n", "35 Dealer Manual 0 \n", "86 Dealer Manual 0 \n", "196 Individual Automatic 0 " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Candidate outliers: very high present price, very high mileage, or CNG fuel.\n", "df_orig.loc[\n", "    (df_orig['present_price'] >= 60.)\n", "    | (df_orig['driven_kms'] >= 400000)\n", "    | df_orig['fuel_type'].isin(('CNG',))\n", "]" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "# Schedule the extreme-mileage rows for removal. The labels are derived from the\n", "# condition rather than hard-coded, so they stay correct if the CSV rows are reordered.\n", "labels_to_drop_from_orig.extend(df_orig.index[df_orig['driven_kms'] >= 400000])" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "# Cleaned table: the raw table without the scheduled rows (df_orig itself is left untouched).\n", "df = df_orig.drop(index=labels_to_drop_from_orig)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "299" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of rows left after the removal.\n", "len(df)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lengthdtype
car_name299object
year299int64
selling_price299float64
present_price299float64
driven_kms299int64
fuel_type299object
selling_type299object
transmission299object
owner299int64
\n", "
" ], "text/plain": [ " length dtype\n", "car_name 299 object\n", "year 299 int64\n", "selling_price 299 float64\n", "present_price 299 float64\n", "driven_kms 299 int64\n", "fuel_type 299 object\n", "selling_type 299 object\n", "transmission 299 object\n", "owner 299 int64" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Per-column overview of the cleaned table, produced by a project helper.\n", "iis_project.pandas_utils.describe_df(df)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
selling_pricepresent_pricedriven_kms
count299.000000299.000000299.000000
mean4.6835457.59839535047.187291
std5.0916118.61133527607.236346
min0.1000000.320000500.000000
25%0.9000001.23000015000.000000
50%3.6500006.40000032000.000000
75%6.0000009.87500047500.000000
max35.00000092.600000213000.000000
\n", "
" ], "text/plain": [ " selling_price present_price driven_kms\n", "count 299.000000 299.000000 299.000000\n", "mean 4.683545 7.598395 35047.187291\n", "std 5.091611 8.611335 27607.236346\n", "min 0.100000 0.320000 500.000000\n", "25% 0.900000 1.230000 15000.000000\n", "50% 3.650000 6.400000 32000.000000\n", "75% 6.000000 9.875000 47500.000000\n", "max 35.000000 92.600000 213000.000000" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Summary statistics of the numeric columns of the cleaned table.\n", "df.loc[:, list(numeric_columns_orig)].describe()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "'car_name': (97 values)\n", "'fuel_type': 'Petrol', 'Diesel', 'CNG'\n", "'selling_type': 'Dealer', 'Individual'\n", "'transmission': 'Manual', 'Automatic'\n", "'owner': np.int64(0), np.int64(1)\n" ] } ], "source": [ "# Collect the distinct values of every categorical column of the cleaned table.\n", "categorical_values_for_columns = {\n", "    column_name: column_values.unique()\n", "    for column_name, column_values in df[list(categorical_columns_orig)].items()\n", "}\n", "\n", "# Spell the values out for small sets (up to 16); otherwise report only how many there are.\n", "for column_name, distinct_values in categorical_values_for_columns.items():\n", "    values_str = (\n", "        ', '.join(repr(value) for value in distinct_values)\n", "        if len(distinct_values) <= 16\n", "        else f'({len(distinct_values)} values)'\n", "    )\n", "    print(f'{column_name!r}: {values_str}')" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Log-binned histograms of both price columns. The cleaned table is used here,\n", "# in line with the other cells that follow the row removal.\n", "for column_name in ('present_price', 'selling_price'):\n", "    column_values = df[column_name]\n", "    # Bin edges equally spaced in log10 between the column's smallest and largest value;\n", "    # one more edge than the suggested number of bins.\n", "    bin_edges = numpy.logspace(\n", "        numpy.log10(column_values.min()),\n", "        numpy.log10(column_values.max()),\n", "        iis_project.plotting_utils.suggest_bins_num(len(column_values)) + 1,\n", "        endpoint=True,\n", "        base=10,\n", "    )\n", "    _fig, _ax = matplotlib.pyplot.subplots()\n", "    _ax.set_title(str(column_name))\n", "    _ax.set_xscale('symlog')\n", "    _ax.set_yscale('log')\n", "    _ax.grid(True)\n", "    _ = _ax.hist(column_values, bins=bin_edges)\n", "    # Start the x axis at zero and leave the upper limit automatic.\n", "    _ = _ax.set_xlim((0, None))" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Present price plotted against selling price for the cleaned table, both axes symlog-scaled.\n", "_fig, _ax = matplotlib.pyplot.subplots()\n", "_ax.set_xscale('symlog')\n", "_ax.set_yscale('symlog')\n", "# Label the axes so the two price columns can be told apart on the figure.\n", "_ax.set_xlabel('selling_price')\n", "_ax.set_ylabel('present_price')\n", "_ax.grid(True)\n", "_ = _ax.scatter(df['selling_price'], df['present_price'])" ] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }