Вы не можете выбрать более 25 тем
Темы должны начинаться с буквы или цифры, могут содержать дефисы (-) и должны содержать не более 35 символов.
36 KiB
36 KiB
import numpy as np
import sklearn.metrics.pairwise as pw

# Three single-row samples. B differs from A by 1 in the last feature;
# C differs from A by 100 in the middle feature.
A = [[3, 4000, 5]]
B = [[3, 4000, 4]]
C = [[3, 4100, 5]]

# Compare A with B. Note that the "Cosine" line prints a similarity
# (cosine_similarity), while the other two lines print distances.
print('Euclidean: \t', pw.euclidean_distances(A, B))
print('Cosine: \t', pw.cosine_similarity(A, B))
print('Manhattan: \t', pw.manhattan_distances(A, B))
# Recorded output:
# Euclidean: [[1.]]
# Cosine: [[0.99999997]]
# Manhattan: [[1.]]
# Same three metrics for A versus C (the vectors differ by 100 in the
# middle feature only).
print('Euclidean: \t', pw.euclidean_distances(A, C))
print('Cosine: \t', pw.cosine_similarity(A, C))
print('Manhattan: \t', pw.manhattan_distances(A, C))
# Recorded output:
# Euclidean: [[100.]]
# Cosine: [[1.]]
# Manhattan: [[100.]]
import sklearn.metrics.pairwise as pw

# Three single-row, five-feature samples used to contrast the metrics.
D = [[6, 0, 0, 3, 3]]
E = [[3, 0, 0, 2, 2]]
F = [[1, 1, 1, 1, 1]]

# Each metric is evaluated for the pair (D, E) and for the pair (E, F).
print('Euclidean E-D: \t', pw.euclidean_distances(D, E))
print('Euclidean E-F: \t', pw.euclidean_distances(E, F))

# Cosine lines print a similarity, not a distance.
print('\nCosine: E-D \t', pw.cosine_similarity(D, E))
print('Cosine E-F: \t', pw.cosine_similarity(E, F))

print('\nManhattan: E-D \t', pw.manhattan_distances(D, E))
print('Manhattan E-F: \t', pw.manhattan_distances(E, F))
# Recorded output:
# Euclidean E-D: [[3.31662479]]
# Euclidean E-F: [[2.82842712]]
# Cosine: E-D [[0.99014754]]
# Cosine E-F: [[0.7592566]]
# Manhattan: E-D [[5.]]
# Manhattan E-F: [[6.]]
import pandas as pd

# Toy table with one numeric column containing a missing value (A),
# two string-valued categorical columns (B, D) and one numeric column (C).
dataset = pd.DataFrame({
    'A': [1, 2, None, 2],
    'B': ['red', 'red', 'yellow', 'green'],
    'C': [3300, 1250, 4600, 4500],
    'D': ['MSK', 'SPB', 'EKB', 'MSK'],
})
dataset  # bare expression: shown as the cell result in a notebook
# Recorded output:
# |   | A   | B      | C    | D   |
# |---|-----|--------|------|-----|
# | 0 | 1.0 | red    | 3300 | MSK |
# | 1 | 2.0 | red    | 1250 | SPB |
# | 2 | NaN | yellow | 4600 | EKB |
# | 3 | 2.0 | green  | 4500 | MSK |
# One-hot (OHE) encoding: replace column B with one indicator column per
# distinct value (B_green, B_red, B_yellow).
# dtype=int keeps the indicators as 0/1, matching the recorded output;
# recent pandas versions default to boolean indicator columns.
dataset = pd.get_dummies(dataset, columns=['B'], dtype=int)
dataset  # bare expression: shown as the cell result in a notebook
# Recorded output:
# |   | A   | C    | D   | B_green | B_red | B_yellow |
# |---|-----|------|-----|---------|-------|----------|
# | 0 | 1.0 | 3300 | MSK | 0       | 1     | 0        |
# | 1 | 2.0 | 1250 | SPB | 0       | 1     | 0        |
# | 2 | NaN | 4600 | EKB | 0       | 0     | 1        |
# | 3 | 2.0 | 4500 | MSK | 1       | 0     | 0        |
# Label encoding
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(dataset['D'])LabelEncoder()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LabelEncoder()
# Replace the strings in D with the integer codes of the fitted encoder.
# In the recorded output EKB -> 0, MSK -> 1, SPB -> 2.
dataset['D'] = le.transform(dataset['D'])
dataset  # bare expression: shown as the cell result in a notebook
# Recorded output:
# |   | A   | C    | D | B_green | B_red | B_yellow |
# |---|-----|------|---|---------|-------|----------|
# | 0 | 1.0 | 3300 | 1 | 0       | 1     | 0        |
# | 1 | 2.0 | 1250 | 2 | 0       | 1     | 0        |
# | 2 | NaN | 4600 | 0 | 0       | 0     | 1        |
# | 3 | 2.0 | 4500 | 1 | 1       | 0     | 0        |
# Fill in the missing values: NaN entries of column A are replaced with the
# column mean. Series.mean() skips NaN by default, so the mean is computed
# over the known values only (1, 2, 2 -> 1.666667 in the recorded output).
dataset['A'] = dataset['A'].fillna(dataset['A'].mean())
dataset  # bare expression: shown as the cell result in a notebook
# Recorded output:
# |   | A        | C    | D | B_green | B_red | B_yellow |
# |---|----------|------|---|---------|-------|----------|
# | 0 | 1.000000 | 3300 | 1 | 0       | 1     | 0        |
# | 1 | 2.000000 | 1250 | 2 | 0       | 1     | 0        |
# | 2 | 1.666667 | 4600 | 0 | 0       | 0     | 1        |
# | 3 | 2.000000 | 4500 | 1 | 1       | 0     | 0        |
# Min-max normalisation: rescale C linearly so its minimum maps to 0 and
# its maximum maps to 1.
c_min = dataset['C'].min()
c_max = dataset['C'].max()
dataset['C_normalized'] = (dataset['C'] - c_min) / (c_max - c_min)

# Standardisation (z-score): subtract the mean and divide by the standard
# deviation. Series.std() is the sample standard deviation (ddof=1).
dataset['C_standardized'] = (dataset['C'] - dataset['C'].mean()) / dataset['C'].std()
dataset  # bare expression: shown as the cell result in a notebook
# Recorded output:
# |   | A        | C    | D | B_green | B_red | B_yellow | C_normalized | C_standardized |
# |---|----------|------|---|---------|-------|----------|--------------|----------------|
# | 0 | 1.000000 | 3300 | 1 | 0       | 1     | 0        | 0.611940     | -0.072209      |
# | 1 | 2.000000 | 1250 | 2 | 0       | 1     | 0        | 0.000000     | -1.388018      |
# | 2 | 1.666667 | 4600 | 0 | 0       | 0     | 1        | 1.000000     | 0.762206       |
# | 3 | 2.000000 | 4500 | 1 | 1       | 0     | 0        | 0.970149     | 0.698021       |
# Draw box plots of the two rescaled versions of C.
dataset.boxplot(column=['C_normalized', 'C_standardized'])
# Recorded output:
# <Axes: >
