{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Similarity metrics and basic feature preprocessing\n",
    "\n",
    "A short walkthrough in two parts:\n",
    "\n",
    "1. Euclidean distance, cosine similarity and Manhattan distance between small example vectors (`sklearn.metrics.pairwise`).\n",
    "2. Preparing a toy `DataFrame`: one-hot encoding, label encoding, mean imputation, min-max normalization and z-score standardization.\n",
    "\n",
    "Every cell builds a new, stage-named variable from the previous one, so any cell can be re-run safely."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import sklearn.metrics.pairwise as pw\n",
    "from sklearn import preprocessing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pairwise metrics\n",
    "\n",
    "Euclidean and Manhattan are *distances* (0 means identical vectors); cosine is a *similarity* (1 means same direction).\n",
    "The `sklearn.metrics.pairwise` functions expect 2-D inputs of shape `(n_samples, n_features)`, hence the nested lists."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "A = [[3, 4, 5]]\n",
    "B = [[3, 5, 4]]\n",
    "C = [[1, 2, 1]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def print_metrics(x, y, pair=''):\n",
    "    \"\"\"Print Euclidean distance, cosine similarity and Manhattan distance between x and y.\n",
    "\n",
    "    x, y: 2-D array-likes of shape (n_samples, n_features).\n",
    "    pair: optional tag (e.g. 'D-E') inserted into every printed label.\n",
    "    \"\"\"\n",
    "    tag = f' {pair}' if pair else ''\n",
    "    print(f'Euclidean{tag}: \\t', pw.euclidean_distances(x, y))\n",
    "    print(f'Cosine{tag}: \\t', pw.cosine_similarity(x, y))\n",
    "    print(f'Manhattan{tag}: \\t', pw.manhattan_distances(x, y))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Euclidean: \t [[1.41421356]]\n",
      "Cosine: \t [[0.98]]\n",
      "Manhattan: \t [[2.]]\n"
     ]
    }
   ],
   "source": [
    "print_metrics(A, B)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Euclidean: \t [[4.89897949]]\n",
      "Cosine: \t [[0.92376043]]\n",
      "Manhattan: \t [[8.]]\n"
     ]
    }
   ],
   "source": [
    "print_metrics(A, C)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Euclidean D-E: \t [[3.31662479]]\n",
      "Cosine D-E: \t [[0.99014754]]\n",
      "Manhattan D-E: \t [[5.]]\n",
      "\n",
      "Euclidean E-F: \t [[2.82842712]]\n",
      "Cosine E-F: \t [[0.7592566]]\n",
      "Manhattan E-F: \t [[6.]]\n"
     ]
    }
   ],
   "source": [
    "D = [[6, 0, 0, 3, 3]]\n",
    "E = [[3, 0, 0, 2, 2]]\n",
    "F = [[1, 1, 1, 1, 1]]\n",
    "\n",
    "print_metrics(D, E, pair='D-E')\n",
    "print()\n",
    "print_metrics(E, F, pair='E-F')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature preprocessing on a toy dataset\n",
    "\n",
    "Stages: `dataset_raw` → `dataset_ohe` (one-hot `B`) → `dataset_encoded` (label-encoded `D`) → `dataset_filled` (imputed `A`) → `dataset` (scaled `C`).\n",
    "\n",
    "Caveats:\n",
    "\n",
    "- `LabelEncoder` is documented by scikit-learn as an encoder for *targets*; for input features `OrdinalEncoder` is the recommended tool. It is used here on a single column for brevity. The integer codes follow the sorted class order and carry no real ordering.\n",
    "- `Series.std()` uses the sample standard deviation (`ddof=1`), so `C_standardized` differs slightly from what `StandardScaler` (`ddof=0`) would produce."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n",
       " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n",
       " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n",
       " <th></th>\n", " <th>A</th>\n", " <th>B</th>\n", " <th>C</th>\n", " <th>D</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n",
       " <tr>\n", " <th>0</th>\n", " <td>1.0</td>\n", " <td>red</td>\n", " <td>3300</td>\n", " <td>MSK</td>\n", " </tr>\n",
       " <tr>\n", " <th>1</th>\n", " <td>2.0</td>\n", " <td>red</td>\n", " <td>1250</td>\n", " <td>SPB</td>\n", " </tr>\n",
       " <tr>\n", " <th>2</th>\n", " <td>NaN</td>\n", " <td>yellow</td>\n", " <td>4600</td>\n", " <td>EKB</td>\n", " </tr>\n",
       " <tr>\n", " <th>3</th>\n", " <td>2.0</td>\n", " <td>green</td>\n", " <td>4500</td>\n", " <td>MSK</td>\n", " </tr>\n",
       " </tbody>\n", "</table>\n", "</div>"
      ],
      "text/plain": [
       "     A       B     C    D\n",
       "0  1.0     red  3300  MSK\n",
       "1  2.0     red  1250  SPB\n",
       "2  NaN  yellow  4600  EKB\n",
       "3  2.0   green  4500  MSK"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset_raw = pd.DataFrame({\n",
    "    'A': [1, 2, None, 2],\n",
    "    'B': ['red', 'red', 'yellow', 'green'],\n",
    "    'C': [3300, 1250, 4600, 4500],\n",
    "    'D': ['MSK', 'SPB', 'EKB', 'MSK'],\n",
    "})\n",
    "dataset_raw"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n",
       " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n",
       " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n",
       " <th></th>\n", " <th>A</th>\n", " <th>C</th>\n", " <th>D</th>\n", " <th>B_green</th>\n", " <th>B_red</th>\n", " <th>B_yellow</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n",
       " <tr>\n", " <th>0</th>\n", " <td>1.0</td>\n", " <td>3300</td>\n", " <td>MSK</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n",
       " <tr>\n", " <th>1</th>\n", " <td>2.0</td>\n", " <td>1250</td>\n", " <td>SPB</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n",
       " <tr>\n", " <th>2</th>\n", " <td>NaN</td>\n", " <td>4600</td>\n", " <td>EKB</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n",
       " <tr>\n", " <th>3</th>\n", " <td>2.0</td>\n", " <td>4500</td>\n", " <td>MSK</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n",
       " </tbody>\n", "</table>\n", "</div>"
      ],
      "text/plain": [
       "     A     C    D  B_green  B_red  B_yellow\n",
       "0  1.0  3300  MSK        0      1         0\n",
       "1  2.0  1250  SPB        0      1         0\n",
       "2  NaN  4600  EKB        0      0         1\n",
       "3  2.0  4500  MSK        1      0         0"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# One-hot encode the colour column.\n",
    "# dtype=int keeps 0/1 indicator columns on pandas >= 2.0, where the default dtype is bool.\n",
    "dataset_ohe = pd.get_dummies(dataset_raw, columns=['B'], dtype=int)\n",
    "dataset_ohe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "LabelEncoder()"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Label encoding: learn the mapping from city code to integer\n",
    "le = preprocessing.LabelEncoder()\n",
    "le.fit(dataset_ohe['D'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n",
       " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n",
       " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n",
       " <th></th>\n", " <th>A</th>\n", " <th>C</th>\n", " <th>D</th>\n", " <th>B_green</th>\n", " <th>B_red</th>\n", " <th>B_yellow</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n",
       " <tr>\n", " <th>0</th>\n", " <td>1.0</td>\n", " <td>3300</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n",
       " <tr>\n", " <th>1</th>\n", " <td>2.0</td>\n", " <td>1250</td>\n", " <td>2</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n",
       " <tr>\n", " <th>2</th>\n", " <td>NaN</td>\n", " <td>4600</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n",
       " <tr>\n", " <th>3</th>\n", " <td>2.0</td>\n", " <td>4500</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n",
       " </tbody>\n", "</table>\n", "</div>"
      ],
      "text/plain": [
       "     A     C  D  B_green  B_red  B_yellow\n",
       "0  1.0  3300  1        0      1         0\n",
       "1  2.0  1250  2        0      1         0\n",
       "2  NaN  4600  0        0      0         1\n",
       "3  2.0  4500  1        1      0         0"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Build a new frame instead of overwriting D in place, so this cell can be re-run\n",
    "dataset_encoded = dataset_ohe.assign(D=le.transform(dataset_ohe['D']))\n",
    "dataset_encoded"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n",
       " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n",
       " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n",
       " <th></th>\n", " <th>A</th>\n", " <th>C</th>\n", " <th>D</th>\n", " <th>B_green</th>\n", " <th>B_red</th>\n", " <th>B_yellow</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n",
       " <tr>\n", " <th>0</th>\n", " <td>1.000000</td>\n", " <td>3300</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n",
       " <tr>\n", " <th>1</th>\n", " <td>2.000000</td>\n", " <td>1250</td>\n", " <td>2</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " </tr>\n",
       " <tr>\n", " <th>2</th>\n", " <td>1.666667</td>\n", " <td>4600</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " </tr>\n",
       " <tr>\n", " <th>3</th>\n", " <td>2.000000</td>\n", " <td>4500</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " </tr>\n",
       " </tbody>\n", "</table>\n", "</div>"
      ],
      "text/plain": [
       "          A     C  D  B_green  B_red  B_yellow\n",
       "0  1.000000  3300  1        0      1         0\n",
       "1  2.000000  1250  2        0      1         0\n",
       "2  1.666667  4600  0        0      0         1\n",
       "3  2.000000  4500  1        1      0         0"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fill the missing value in A with the column mean (Series.mean skips NaN)\n",
    "dataset_filled = dataset_encoded.assign(\n",
    "    A=dataset_encoded['A'].fillna(dataset_encoded['A'].mean())\n",
    ")\n",
    "dataset_filled"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n",
       " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n",
       " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n",
       " <th></th>\n", " <th>A</th>\n", " <th>C</th>\n", " <th>D</th>\n", " <th>B_green</th>\n", " <th>B_red</th>\n", " <th>B_yellow</th>\n", " <th>C_normalized</th>\n", " <th>C_standardized</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n",
       " <tr>\n", " <th>0</th>\n", " <td>1.000000</td>\n", " <td>3300</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0.611940</td>\n", " <td>-0.072209</td>\n", " </tr>\n",
       " <tr>\n", " <th>1</th>\n", " <td>2.000000</td>\n", " <td>1250</td>\n", " <td>2</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0.000000</td>\n", " <td>-1.388018</td>\n", " </tr>\n",
       " <tr>\n", " <th>2</th>\n", " <td>1.666667</td>\n", " <td>4600</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>1</td>\n", " <td>1.000000</td>\n", " <td>0.762206</td>\n", " </tr>\n",
       " <tr>\n", " <th>3</th>\n", " <td>2.000000</td>\n", " <td>4500</td>\n", " <td>1</td>\n", " <td>1</td>\n", " <td>0</td>\n", " <td>0</td>\n", " <td>0.970149</td>\n", " <td>0.698021</td>\n", " </tr>\n",
       " </tbody>\n", "</table>\n", "</div>"
      ],
      "text/plain": [
       "          A     C  D  B_green  B_red  B_yellow  C_normalized  C_standardized\n",
       "0  1.000000  3300  1        0      1         0      0.611940       -0.072209\n",
       "1  2.000000  1250  2        0      1         0      0.000000       -1.388018\n",
       "2  1.666667  4600  0        0      0         1      1.000000        0.762206\n",
       "3  2.000000  4500  1        1      0         0      0.970149        0.698021"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Min-max normalization maps C onto [0, 1]; standardization centres it and divides by\n",
    "# the sample standard deviation (pandas default ddof=1).\n",
    "c = dataset_filled['C']\n",
    "dataset = dataset_filled.assign(\n",
    "    C_normalized=(c - c.min()) / (c.max() - c.min()),\n",
    "    C_standardized=(c - c.mean()) / c.std(),\n",
    ")\n",
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity checks on the final frame\n",
    "assert 'B' in dataset_raw.columns  # the raw frame is left untouched by later stages\n",
    "assert dataset['A'].notna().all()\n",
    "assert list(le.classes_) == ['EKB', 'MSK', 'SPB']\n",
    "np.testing.assert_allclose(dataset['C_normalized'].agg(['min', 'max']), [0.0, 1.0])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}