Вы не можете выбрать более 25 тем
Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.
16 KiB
16 KiB
import numpy as np
import sklearn.metrics.pairwise as pw
# Three sample vectors with three features each. They are written as nested
# lists (one row per sample) because they are passed to the sklearn pairwise
# functions, which take 2-D inputs.
A = [[3, 4, 5]]
B = [[3, 5, 4]]
C = [[1, 2, 1]]
# Compare A with B under each metric. Note that cosine_similarity is a
# similarity (higher means closer), while the other two are distances.
ab_metrics = (
    ('Euclidean: \t', pw.euclidean_distances),
    ('Cosine: \t', pw.cosine_similarity),
    ('Manhattan: \t', pw.manhattan_distances),
)
for label, metric in ab_metrics:
    print(label, metric(A, B))
Euclidean: [[1.41421356]]
Cosine: [[0.98]]
Manhattan: [[2.]]
# Same three measures, this time between A and C.
ac_metrics = (
    ('Euclidean: \t', pw.euclidean_distances),
    ('Cosine: \t', pw.cosine_similarity),
    ('Manhattan: \t', pw.manhattan_distances),
)
for label, metric in ac_metrics:
    print(label, metric(A, C))
Euclidean: [[4.89897949]]
Cosine: [[0.92376043]]
Manhattan: [[8.]]
import sklearn.metrics.pairwise as pw
# Three sample vectors with five features each, one row per sample, used as
# inputs to the pairwise distance / similarity calls that follow.
D = [[6, 0, 0, 3, 3]]
E = [[3, 0, 0, 2, 2]]
F = [[1, 1, 1, 1, 1]]
# Each entry: (label, metric function, left operand, right operand).
# The pairs (D, E) and (E, F) are evaluated under every metric; labels that
# start with '\n' visually separate the metric groups in the printed output.
comparisons = [
    ('Euclidean E-D: \t', pw.euclidean_distances, D, E),
    ('Euclidean E-F: \t', pw.euclidean_distances, E, F),
    ('\nCosine: E-D \t', pw.cosine_similarity, D, E),
    ('Cosine E-F: \t', pw.cosine_similarity, E, F),
    ('\nManhattan: E-D \t', pw.manhattan_distances, D, E),
    ('Manhattan E-F: \t', pw.manhattan_distances, E, F),
]
for label, metric, left, right in comparisons:
    print(label, metric(left, right))
Euclidean E-D: [[3.31662479]]
Euclidean E-F: [[2.82842712]]
Cosine: E-D [[0.99014754]]
Cosine E-F: [[0.7592566]]
Manhattan: E-D [[5.]]
Manhattan E-F: [[6.]]
import pandas as pd
# Toy dataset for the preprocessing examples:
#   A - numeric column with one missing value (None becomes NaN),
#   B - categorical column with string labels,
#   C - numeric column on a much larger scale than A,
#   D - categorical column with string labels.
dataset = pd.DataFrame({
    'A': [1, 2, None, 2],
    'B': ['red', 'red', 'yellow', 'green'],
    'C': [3300, 1250, 4600, 4500],
    'D': ['MSK', 'SPB', 'EKB', 'MSK'],
})
# Bare expression: displays the frame when run as a notebook cell.
dataset
| | A | B | C | D |
---|---|---|---|---|
0 | 1.0 | red | 3300 | MSK |
1 | 2.0 | red | 1250 | SPB |
2 | NaN | yellow | 4600 | EKB |
3 | 2.0 | green | 4500 | MSK |
# One-hot encoding (OHE): replace column B with one indicator column per
# distinct value (B_green, B_red, B_yellow).
# dtype=int keeps the indicators as 0/1; newer pandas versions default to
# bool, which would display True/False instead.
dataset = pd.get_dummies(dataset, columns=['B'], dtype=int)
# Bare expression: displays the frame when run as a notebook cell.
dataset
| | A | C | D | B_green | B_red | B_yellow |
---|---|---|---|---|---|---|
0 | 1.0 | 3300 | MSK | 0 | 1 | 0 |
1 | 2.0 | 1250 | SPB | 0 | 1 | 0 |
2 | NaN | 4600 | EKB | 0 | 0 | 1 |
3 | 2.0 | 4500 | MSK | 1 | 0 | 0 |
# Label encoding
from sklearn import preprocessing
# Create the encoder and let it learn the set of distinct labels in column D.
le = preprocessing.LabelEncoder()
le.fit(dataset['D'])
LabelEncoder()
# Replace the string labels in D with the integer codes learned by `le`.
dataset['D'] = le.transform(dataset['D'])
# Bare expression: displays the frame when run as a notebook cell.
dataset
| | A | C | D | B_green | B_red | B_yellow |
---|---|---|---|---|---|---|
0 | 1.0 | 3300 | 1 | 0 | 1 | 0 |
1 | 2.0 | 1250 | 2 | 0 | 1 | 0 |
2 | NaN | 4600 | 0 | 0 | 0 | 1 |
3 | 2.0 | 4500 | 1 | 1 | 0 | 0 |
# Fill in the missing data: replace NaN in column A with the column mean.
# Series.mean() skips NaN by default, so the mean is computed over the
# known values only.
dataset['A'] = dataset['A'].fillna(dataset['A'].mean())
# Bare expression: displays the frame when run as a notebook cell.
dataset
| | A | C | D | B_green | B_red | B_yellow |
---|---|---|---|---|---|---|
0 | 1.000000 | 3300 | 1 | 0 | 1 | 0 |
1 | 2.000000 | 1250 | 2 | 0 | 1 | 0 |
2 | 1.666667 | 4600 | 0 | 0 | 0 | 1 |
3 | 2.000000 | 4500 | 1 | 1 | 0 | 0 |
# Min-max normalization: rescale C linearly so its minimum maps to 0 and
# its maximum maps to 1.
dataset['C_normalized'] = (dataset['C'] - dataset['C'].min()) / (dataset['C'].max() - dataset['C'].min())
# Standardization (z-score): subtract the mean and divide by the standard
# deviation. pandas' Series.std() is the sample standard deviation (ddof=1).
dataset['C_standardized'] = (dataset['C'] - dataset['C'].mean()) / dataset['C'].std()
# Bare expression: displays the frame when run as a notebook cell.
dataset
| | A | C | D | B_green | B_red | B_yellow | C_normalized | C_standardized |
---|---|---|---|---|---|---|---|---|
0 | 1.000000 | 3300 | 1 | 0 | 1 | 0 | 0.611940 | -0.072209 |
1 | 2.000000 | 1250 | 2 | 0 | 1 | 0 | 0.000000 | -1.388018 |
2 | 1.666667 | 4600 | 0 | 0 | 0 | 1 | 1.000000 | 0.762206 |
3 | 2.000000 | 4500 | 1 | 1 | 0 | 0 | 0.970149 | 0.698021 |