diff --git a/lections/OATD_lec_2.pdf b/lections/OATD_lec_2.pdf
index 9bed6cc..30e993d 100644
Binary files a/lections/OATD_lec_2.pdf and b/lections/OATD_lec_2.pdf differ
diff --git a/lections/OATD_lec_2.pptx b/lections/OATD_lec_2.pptx
index 48cde56..9937514 100644
Binary files a/lections/OATD_lec_2.pptx and b/lections/OATD_lec_2.pptx differ
diff --git a/lections/notebooks/lec2_preprocess.ipynb b/lections/notebooks/lec2_preprocess.ipynb
new file mode 100644
index 0000000..4a0cf51
--- /dev/null
+++ b/lections/notebooks/lec2_preprocess.ipynb
@@ -0,0 +1,634 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import numpy as np \n",
+ "import sklearn.metrics.pairwise as pw\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "A = [[3, 4, 5]]\n",
+ "B = [[3, 5, 4]]\n",
+ "C = [[1, 2, 1]]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Euclidean: \t [[1.41421356]]\n",
+ "Cosine: \t [[0.98]]\n",
+ "Manhattan: \t [[2.]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('Euclidean: \\t',pw.euclidean_distances(A, B))\n",
+ "print('Cosine: \\t',pw.cosine_similarity(A, B))\n",
+ "print('Manhattan: \\t',pw.manhattan_distances(A, B))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Euclidean: \t [[4.89897949]]\n",
+ "Cosine: \t [[0.92376043]]\n",
+ "Manhattan: \t [[8.]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('Euclidean: \\t',pw.euclidean_distances(A, C))\n",
+ "print('Cosine: \\t',pw.cosine_similarity(A, C))\n",
+ "print('Manhattan: \\t',pw.manhattan_distances(A, C))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Euclidean E-D: \t [[3.31662479]]\n",
+ "Euclidean E-F: \t [[2.82842712]]\n",
+ "\n",
+ "Cosine: E-D \t [[0.99014754]]\n",
+ "Cosine E-F: \t [[0.7592566]]\n",
+ "\n",
+ "Manhattan: E-D \t [[5.]]\n",
+ "Manhattan E-F: \t [[6.]]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import sklearn.metrics.pairwise as pw\n",
+ "\n",
+ "D = [[6,0,0,3,3]]\n",
+ "E = [[3,0,0,2,2]]\n",
+ "F = [[1,1,1,1,1]]\n",
+ "\n",
+ "print('Euclidean E-D: \\t',pw.euclidean_distances(D, E))\n",
+ "print('Euclidean E-F: \\t',pw.euclidean_distances(E, F))\n",
+ "\n",
+ "print('\\nCosine: E-D \\t',pw.cosine_similarity(D, E))\n",
+ "print('Cosine E-F: \\t',pw.cosine_similarity(E, F))\n",
+ "\n",
+ "print('\\nManhattan: E-D \\t',pw.manhattan_distances(D, E))\n",
+ "print('Manhattan E-F: \\t',pw.manhattan_distances(E, F))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
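+ "<div>\n",
+ "<style scoped>\n",
+ "    .dataframe tbody tr th:only-of-type {\n",
+ "        vertical-align: middle;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe tbody tr th {\n",
+ "        vertical-align: top;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe thead th {\n",
+ "        text-align: right;\n",
+ "    }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>A</th>\n",
+ "      <th>B</th>\n",
+ "      <th>C</th>\n",
+ "      <th>D</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>1.0</td>\n",
+ "      <td>red</td>\n",
+ "      <td>3300</td>\n",
+ "      <td>MSK</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>1</th>\n",
+ "      <td>2.0</td>\n",
+ "      <td>red</td>\n",
+ "      <td>1250</td>\n",
+ "      <td>SPB</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>2</th>\n",
+ "      <td>NaN</td>\n",
+ "      <td>yellow</td>\n",
+ "      <td>4600</td>\n",
+ "      <td>EKB</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>3</th>\n",
+ "      <td>2.0</td>\n",
+ "      <td>green</td>\n",
+ "      <td>4500</td>\n",
+ "      <td>MSK</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"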
+ ],
+ "text/plain": [
+ " A B C D\n",
+ "0 1.0 red 3300 MSK\n",
+ "1 2.0 red 1250 SPB\n",
+ "2 NaN yellow 4600 EKB\n",
+ "3 2.0 green 4500 MSK"
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "dataset = pd.DataFrame({'A': [1 , 2, None, 2], \n",
+ " 'B': ['red', 'red', 'yellow', 'green'], \n",
+ " 'C': [3300, 1250, 4600, 4500],\n",
+ " 'D': ['MSK', 'SPB', 'EKB', 'MSK']})\n",
+ "dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
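+ "<div>\n",
+ "<style scoped>\n",
+ "    .dataframe tbody tr th:only-of-type {\n",
+ "        vertical-align: middle;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe tbody tr th {\n",
+ "        vertical-align: top;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe thead th {\n",
+ "        text-align: right;\n",
+ "    }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>A</th>\n",
+ "      <th>C</th>\n",
+ "      <th>D</th>\n",
+ "      <th>B_green</th>\n",
+ "      <th>B_red</th>\n",
+ "      <th>B_yellow</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>1.0</td>\n",
+ "      <td>3300</td>\n",
+ "      <td>MSK</td>\n",
+ "      <td>0</td>\n",
+ "      <td>1</td>\n",
+ "      <td>0</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>1</th>\n",
+ "      <td>2.0</td>\n",
+ "      <td>1250</td>\n",
+ "      <td>SPB</td>\n",
+ "      <td>0</td>\n",
+ "      <td>1</td>\n",
+ "      <td>0</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>2</th>\n",
+ "      <td>NaN</td>\n",
+ "      <td>4600</td>\n",
+ "      <td>EKB</td>\n",
+ "      <td>0</td>\n",
+ "      <td>0</td>\n",
+ "      <td>1</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>3</th>\n",
+ "      <td>2.0</td>\n",
+ "      <td>4500</td>\n",
+ "      <td>MSK</td>\n",
+ "      <td>1</td>\n",
+ "      <td>0</td>\n",
+ "      <td>0</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"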
+ ],
+ "text/plain": [
+ " A C D B_green B_red B_yellow\n",
+ "0 1.0 3300 MSK 0 1 0\n",
+ "1 2.0 1250 SPB 0 1 0\n",
+ "2 NaN 4600 EKB 0 0 1\n",
+ "3 2.0 4500 MSK 1 0 0"
+ ]
+ },
+ "execution_count": 57,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# One-hot encoding (OHE)\n",
+ "dataset = pd.get_dummies(dataset, columns = ['B'])\n",
+ "dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "LabelEncoder()"
+ ]
+ },
+ "execution_count": 58,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Label encoding\n",
+ "from sklearn import preprocessing\n",
+ "le = preprocessing.LabelEncoder()\n",
+ "le.fit(dataset['D'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
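+ "<div>\n",
+ "<style scoped>\n",
+ "    .dataframe tbody tr th:only-of-type {\n",
+ "        vertical-align: middle;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe tbody tr th {\n",
+ "        vertical-align: top;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe thead th {\n",
+ "        text-align: right;\n",
+ "    }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>A</th>\n",
+ "      <th>C</th>\n",
+ "      <th>D</th>\n",
+ "      <th>B_green</th>\n",
+ "      <th>B_red</th>\n",
+ "      <th>B_yellow</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>1.0</td>\n",
+ "      <td>3300</td>\n",
+ "      <td>1</td>\n",
+ "      <td>0</td>\n",
+ "      <td>1</td>\n",
+ "      <td>0</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>1</th>\n",
+ "      <td>2.0</td>\n",
+ "      <td>1250</td>\n",
+ "      <td>2</td>\n",
+ "      <td>0</td>\n",
+ "      <td>1</td>\n",
+ "      <td>0</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>2</th>\n",
+ "      <td>NaN</td>\n",
+ "      <td>4600</td>\n",
+ "      <td>0</td>\n",
+ "      <td>0</td>\n",
+ "      <td>0</td>\n",
+ "      <td>1</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>3</th>\n",
+ "      <td>2.0</td>\n",
+ "      <td>4500</td>\n",
+ "      <td>1</td>\n",
+ "      <td>1</td>\n",
+ "      <td>0</td>\n",
+ "      <td>0</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"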
+ ],
+ "text/plain": [
+ " A C D B_green B_red B_yellow\n",
+ "0 1.0 3300 1 0 1 0\n",
+ "1 2.0 1250 2 0 1 0\n",
+ "2 NaN 4600 0 0 0 1\n",
+ "3 2.0 4500 1 1 0 0"
+ ]
+ },
+ "execution_count": 59,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataset['D'] = le.transform(dataset['D'])\n",
+ "dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
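+ "<div>\n",
+ "<style scoped>\n",
+ "    .dataframe tbody tr th:only-of-type {\n",
+ "        vertical-align: middle;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe tbody tr th {\n",
+ "        vertical-align: top;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe thead th {\n",
+ "        text-align: right;\n",
+ "    }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>A</th>\n",
+ "      <th>C</th>\n",
+ "      <th>D</th>\n",
+ "      <th>B_green</th>\n",
+ "      <th>B_red</th>\n",
+ "      <th>B_yellow</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>1.000000</td>\n",
+ "      <td>3300</td>\n",
+ "      <td>1</td>\n",
+ "      <td>0</td>\n",
+ "      <td>1</td>\n",
+ "      <td>0</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>1</th>\n",
+ "      <td>2.000000</td>\n",
+ "      <td>1250</td>\n",
+ "      <td>2</td>\n",
+ "      <td>0</td>\n",
+ "      <td>1</td>\n",
+ "      <td>0</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>2</th>\n",
+ "      <td>1.666667</td>\n",
+ "      <td>4600</td>\n",
+ "      <td>0</td>\n",
+ "      <td>0</td>\n",
+ "      <td>0</td>\n",
+ "      <td>1</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>3</th>\n",
+ "      <td>2.000000</td>\n",
+ "      <td>4500</td>\n",
+ "      <td>1</td>\n",
+ "      <td>1</td>\n",
+ "      <td>0</td>\n",
+ "      <td>0</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"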
+ ],
+ "text/plain": [
+ " A C D B_green B_red B_yellow\n",
+ "0 1.000000 3300 1 0 1 0\n",
+ "1 2.000000 1250 2 0 1 0\n",
+ "2 1.666667 4600 0 0 0 1\n",
+ "3 2.000000 4500 1 1 0 0"
+ ]
+ },
+ "execution_count": 60,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Fill in missing values in column A with the column mean\n",
+ "dataset['A'] = dataset['A'].fillna(np.mean(dataset['A']))\n",
+ "dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
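+ "<div>\n",
+ "<style scoped>\n",
+ "    .dataframe tbody tr th:only-of-type {\n",
+ "        vertical-align: middle;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe tbody tr th {\n",
+ "        vertical-align: top;\n",
+ "    }\n",
+ "\n",
+ "    .dataframe thead th {\n",
+ "        text-align: right;\n",
+ "    }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ "  <thead>\n",
+ "    <tr style=\"text-align: right;\">\n",
+ "      <th></th>\n",
+ "      <th>A</th>\n",
+ "      <th>C</th>\n",
+ "      <th>D</th>\n",
+ "      <th>B_green</th>\n",
+ "      <th>B_red</th>\n",
+ "      <th>B_yellow</th>\n",
+ "      <th>C_normalized</th>\n",
+ "      <th>C_standardized</th>\n",
+ "    </tr>\n",
+ "  </thead>\n",
+ "  <tbody>\n",
+ "    <tr>\n",
+ "      <th>0</th>\n",
+ "      <td>1.000000</td>\n",
+ "      <td>3300</td>\n",
+ "      <td>1</td>\n",
+ "      <td>0</td>\n",
+ "      <td>1</td>\n",
+ "      <td>0</td>\n",
+ "      <td>0.611940</td>\n",
+ "      <td>-0.072209</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>1</th>\n",
+ "      <td>2.000000</td>\n",
+ "      <td>1250</td>\n",
+ "      <td>2</td>\n",
+ "      <td>0</td>\n",
+ "      <td>1</td>\n",
+ "      <td>0</td>\n",
+ "      <td>0.000000</td>\n",
+ "      <td>-1.388018</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>2</th>\n",
+ "      <td>1.666667</td>\n",
+ "      <td>4600</td>\n",
+ "      <td>0</td>\n",
+ "      <td>0</td>\n",
+ "      <td>0</td>\n",
+ "      <td>1</td>\n",
+ "      <td>1.000000</td>\n",
+ "      <td>0.762206</td>\n",
+ "    </tr>\n",
+ "    <tr>\n",
+ "      <th>3</th>\n",
+ "      <td>2.000000</td>\n",
+ "      <td>4500</td>\n",
+ "      <td>1</td>\n",
+ "      <td>1</td>\n",
+ "      <td>0</td>\n",
+ "      <td>0</td>\n",
+ "      <td>0.970149</td>\n",
+ "      <td>0.698021</td>\n",
+ "    </tr>\n",
+ "  </tbody>\n",
+ "</table>\n",
+ "</div>"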
+ ],
+ "text/plain": [
+ " A C D B_green B_red B_yellow C_normalized C_standardized\n",
+ "0 1.000000 3300 1 0 1 0 0.611940 -0.072209\n",
+ "1 2.000000 1250 2 0 1 0 0.000000 -1.388018\n",
+ "2 1.666667 4600 0 0 0 1 1.000000 0.762206\n",
+ "3 2.000000 4500 1 1 0 0 0.970149 0.698021"
+ ]
+ },
+ "execution_count": 64,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dataset['C_normalized'] = (dataset['C'] - dataset['C'].min()) / (dataset['C'].max() - dataset['C'].min())\n",
+ "dataset['C_standardized'] = (dataset['C'] - dataset['C'].mean()) / dataset['C'].std()\n",
+ "dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}