{ "cells": [ { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "import numpy as np \n", "import sklearn.metrics.pairwise as pw\n" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "A = [[3, 4, 5]]\n", "B = [[3, 5, 4]]\n", "C = [[1, 2, 1]]\n" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Euclidean: \t [[1.41421356]]\n", "Cosine: \t [[0.98]]\n", "Manhattan: \t [[2.]]\n" ] } ], "source": [ "print('Euclidean: \\t',pw.euclidean_distances(A, B))\n", "print('Cosine: \\t',pw.cosine_similarity(A, B))\n", "print('Manhattan: \\t',pw.manhattan_distances(A, B))" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Euclidean: \t [[4.89897949]]\n", "Cosine: \t [[0.92376043]]\n", "Manhattan: \t [[8.]]\n" ] } ], "source": [ "print('Euclidean: \\t',pw.euclidean_distances(A, C))\n", "print('Cosine: \\t',pw.cosine_similarity(A, C))\n", "print('Manhattan: \\t',pw.manhattan_distances(A, C))" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Euclidean E-D: \t [[3.31662479]]\n", "Euclidean E-F: \t [[2.82842712]]\n", "\n", "Cosine: E-D \t [[0.99014754]]\n", "Cosine E-F: \t [[0.7592566]]\n", "\n", "Manhattan: E-D \t [[5.]]\n", "Manhattan E-F: \t [[6.]]\n" ] } ], "source": [ "import sklearn.metrics.pairwise as pw\n", "\n", "D = [[6,0,0,3,3]]\n", "E = [[3,0,0,2,2]]\n", "F = [[1,1,1,1,1]]\n", "\n", "print('Euclidean E-D: \\t',pw.euclidean_distances(D, E))\n", "print('Euclidean E-F: \\t',pw.euclidean_distances(E, F))\n", "\n", "print('\\nCosine: E-D \\t',pw.cosine_similarity(D, E))\n", "print('Cosine E-F: \\t',pw.cosine_similarity(E, F))\n", "\n", "print('\\nManhattan: E-D \\t',pw.manhattan_distances(D, E))\n", "print('Manhattan E-F: \\t',pw.manhattan_distances(E, F))" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ABCD
01.0red3300MSK
12.0red1250SPB
2NaNyellow4600EKB
32.0green4500MSK
\n", "
" ], "text/plain": [ " A B C D\n", "0 1.0 red 3300 MSK\n", "1 2.0 red 1250 SPB\n", "2 NaN yellow 4600 EKB\n", "3 2.0 green 4500 MSK" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "dataset = pd.DataFrame({'A': [1 , 2, None, 2], \n", " 'B': ['red', 'red', 'yellow', 'green'], \n", " 'C': [3300, 1250, 4600, 4500],\n", " 'D': ['MSK', 'SPB', 'EKB', 'MSK']})\n", "dataset" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ACDB_greenB_redB_yellow
01.03300MSK010
12.01250SPB010
2NaN4600EKB001
32.04500MSK100
\n", "
" ], "text/plain": [ " A C D B_green B_red B_yellow\n", "0 1.0 3300 MSK 0 1 0\n", "1 2.0 1250 SPB 0 1 0\n", "2 NaN 4600 EKB 0 0 1\n", "3 2.0 4500 MSK 1 0 0" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# OHE encoding\n", "dataset = pd.get_dummies(dataset, columns = ['B'])\n", "dataset" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "LabelEncoder()" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Label encoding\n", "from sklearn import preprocessing\n", "le = preprocessing.LabelEncoder()\n", "le.fit(dataset['D'])" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ACDB_greenB_redB_yellow
01.033001010
12.012502010
2NaN46000001
32.045001100
\n", "
" ], "text/plain": [ " A C D B_green B_red B_yellow\n", "0 1.0 3300 1 0 1 0\n", "1 2.0 1250 2 0 1 0\n", "2 NaN 4600 0 0 0 1\n", "3 2.0 4500 1 1 0 0" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset['D'] = le.transform(dataset['D'])\n", "dataset" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ACDB_greenB_redB_yellow
01.00000033001010
12.00000012502010
21.66666746000001
32.00000045001100
\n", "
" ], "text/plain": [ " A C D B_green B_red B_yellow\n", "0 1.000000 3300 1 0 1 0\n", "1 2.000000 1250 2 0 1 0\n", "2 1.666667 4600 0 0 0 1\n", "3 2.000000 4500 1 1 0 0" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Заполняем пропущенные данные\n", "dataset['A'] = dataset['A'].fillna(np.mean(dataset['A']))\n", "dataset" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ACDB_greenB_redB_yellowC_normalizedC_standardized
01.000000330010100.611940-0.072209
12.000000125020100.000000-1.388018
21.666667460000011.0000000.762206
32.000000450011000.9701490.698021
\n", "
" ], "text/plain": [ " A C D B_green B_red B_yellow C_normalized C_standardized\n", "0 1.000000 3300 1 0 1 0 0.611940 -0.072209\n", "1 2.000000 1250 2 0 1 0 0.000000 -1.388018\n", "2 1.666667 4600 0 0 0 1 1.000000 0.762206\n", "3 2.000000 4500 1 1 0 0 0.970149 0.698021" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset['C_normalized'] = (dataset['C'] - dataset['C'].min()) / (dataset['C'].max() - dataset['C'].min())\n", "dataset['C_standardized'] = (dataset['C'] - dataset['C'].mean()) / dataset['C'].std()\n", "dataset" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" } }, "nbformat": 4, "nbformat_minor": 4 }