diff --git a/lections/OATD_lec_2.pdf b/lections/OATD_lec_2.pdf index 9bed6cc..30e993d 100644 Binary files a/lections/OATD_lec_2.pdf and b/lections/OATD_lec_2.pdf differ diff --git a/lections/OATD_lec_2.pptx b/lections/OATD_lec_2.pptx index 48cde56..9937514 100644 Binary files a/lections/OATD_lec_2.pptx and b/lections/OATD_lec_2.pptx differ diff --git a/lections/notebooks/lec2_preprocess.ipynb b/lections/notebooks/lec2_preprocess.ipynb new file mode 100644 index 0000000..4a0cf51 --- /dev/null +++ b/lections/notebooks/lec2_preprocess.ipynb @@ -0,0 +1,634 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np \n", + "import sklearn.metrics.pairwise as pw\n" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "A = [[3, 4, 5]]\n", + "B = [[3, 5, 4]]\n", + "C = [[1, 2, 1]]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Euclidean: \t [[1.41421356]]\n", + "Cosine: \t [[0.98]]\n", + "Manhattan: \t [[2.]]\n" + ] + } + ], + "source": [ + "print('Euclidean: \\t',pw.euclidean_distances(A, B))\n", + "print('Cosine: \\t',pw.cosine_similarity(A, B))\n", + "print('Manhattan: \\t',pw.manhattan_distances(A, B))" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Euclidean: \t [[4.89897949]]\n", + "Cosine: \t [[0.92376043]]\n", + "Manhattan: \t [[8.]]\n" + ] + } + ], + "source": [ + "print('Euclidean: \\t',pw.euclidean_distances(A, C))\n", + "print('Cosine: \\t',pw.cosine_similarity(A, C))\n", + "print('Manhattan: \\t',pw.manhattan_distances(A, C))" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Euclidean E-D: \t [[3.31662479]]\n", + "Euclidean E-F: \t [[2.82842712]]\n", + "\n", + "Cosine: E-D \t [[0.99014754]]\n", + "Cosine E-F: \t [[0.7592566]]\n", + "\n", + "Manhattan: E-D \t [[5.]]\n", + "Manhattan E-F: \t [[6.]]\n" + ] + } + ], + "source": [ + "import sklearn.metrics.pairwise as pw\n", + "\n", + "D = [[6,0,0,3,3]]\n", + "E = [[3,0,0,2,2]]\n", + "F = [[1,1,1,1,1]]\n", + "\n", + "print('Euclidean E-D: \\t',pw.euclidean_distances(D, E))\n", + "print('Euclidean E-F: \\t',pw.euclidean_distances(E, F))\n", + "\n", + "print('\\nCosine: E-D \\t',pw.cosine_similarity(D, E))\n", + "print('Cosine E-F: \\t',pw.cosine_similarity(E, F))\n", + "\n", + "print('\\nManhattan: E-D \\t',pw.manhattan_distances(D, E))\n", + "print('Manhattan E-F: \\t',pw.manhattan_distances(E, F))" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
01.0red3300MSK
12.0red1250SPB
2NaNyellow4600EKB
32.0green4500MSK
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "0 1.0 red 3300 MSK\n", + "1 2.0 red 1250 SPB\n", + "2 NaN yellow 4600 EKB\n", + "3 2.0 green 4500 MSK" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "dataset = pd.DataFrame({'A': [1 , 2, None, 2], \n", + " 'B': ['red', 'red', 'yellow', 'green'], \n", + " 'C': [3300, 1250, 4600, 4500],\n", + " 'D': ['MSK', 'SPB', 'EKB', 'MSK']})\n", + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ACDB_greenB_redB_yellow
01.03300MSK010
12.01250SPB010
2NaN4600EKB001
32.04500MSK100
\n", + "
" + ], + "text/plain": [ + " A C D B_green B_red B_yellow\n", + "0 1.0 3300 MSK 0 1 0\n", + "1 2.0 1250 SPB 0 1 0\n", + "2 NaN 4600 EKB 0 0 1\n", + "3 2.0 4500 MSK 1 0 0" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# OHE encoding\n", + "dataset = pd.get_dummies(dataset, columns = ['B'])\n", + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LabelEncoder()" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Label encoding\n", + "from sklearn import preprocessing\n", + "le = preprocessing.LabelEncoder()\n", + "le.fit(dataset['D'])" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ACDB_greenB_redB_yellow
01.033001010
12.012502010
2NaN46000001
32.045001100
\n", + "
" + ], + "text/plain": [ + " A C D B_green B_red B_yellow\n", + "0 1.0 3300 1 0 1 0\n", + "1 2.0 1250 2 0 1 0\n", + "2 NaN 4600 0 0 0 1\n", + "3 2.0 4500 1 1 0 0" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset['D'] = le.transform(dataset['D'])\n", + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ACDB_greenB_redB_yellow
01.00000033001010
12.00000012502010
21.66666746000001
32.00000045001100
\n", + "
" + ], + "text/plain": [ + " A C D B_green B_red B_yellow\n", + "0 1.000000 3300 1 0 1 0\n", + "1 2.000000 1250 2 0 1 0\n", + "2 1.666667 4600 0 0 0 1\n", + "3 2.000000 4500 1 1 0 0" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Заполняем пропущенные данные\n", + "dataset['A'] = dataset['A'].fillna(np.mean(dataset['A']))\n", + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ACDB_greenB_redB_yellowC_normalizedC_standardized
01.000000330010100.611940-0.072209
12.000000125020100.000000-1.388018
21.666667460000011.0000000.762206
32.000000450011000.9701490.698021
\n", + "
" + ], + "text/plain": [ + " A C D B_green B_red B_yellow C_normalized C_standardized\n", + "0 1.000000 3300 1 0 1 0 0.611940 -0.072209\n", + "1 2.000000 1250 2 0 1 0 0.000000 -1.388018\n", + "2 1.666667 4600 0 0 0 1 1.000000 0.762206\n", + "3 2.000000 4500 1 1 0 0 0.970149 0.698021" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataset['C_normalized'] = (dataset['C'] - dataset['C'].min()) / (dataset['C'].max() - dataset['C'].min())\n", + "dataset['C_standardized'] = (dataset['C'] - dataset['C'].mean()) / dataset['C'].std()\n", + "dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}