Вы не можете выбрать более 25 тем
Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.
36 KiB
36 KiB
import numpy as np
import sklearn.metrics.pairwise as pw
# Three single-row samples with three features each.
# A and B differ only in the last feature (5 vs 4);
# A and C differ only in the second feature (4000 vs 4100).
A = [[3, 4000, 5]]
B = [[3, 4000, 4]]
C = [[3, 4100, 5]]

# Compare A with B under three pairwise measures.
print('Euclidean: \t',pw.euclidean_distances(A, B))
print('Cosine: \t',pw.cosine_similarity(A, B))
print('Manhattan: \t',pw.manhattan_distances(A, B))
Euclidean: [[1.]]
Cosine: [[0.99999997]]
Manhattan: [[1.]]
# Compare A with C under the same three pairwise measures, one line per measure.
for label, metric in (
    ('Euclidean: \t', pw.euclidean_distances),
    ('Cosine: \t', pw.cosine_similarity),
    ('Manhattan: \t', pw.manhattan_distances),
):
    print(label, metric(A, C))
Euclidean: [[100.]]
Cosine: [[1.]]
Manhattan: [[100.]]
import sklearn.metrics.pairwise as pw
# Three single-row samples with five features each.
D = [[6,0,0,3,3]]
E = [[3,0,0,2,2]]
F = [[1,1,1,1,1]]

# For each measure, compare the pair (D, E) and then the pair (E, F).
print('Euclidean E-D: \t',pw.euclidean_distances(D, E))
print('Euclidean E-F: \t',pw.euclidean_distances(E, F))
print('\nCosine: E-D \t',pw.cosine_similarity(D, E))
print('Cosine E-F: \t',pw.cosine_similarity(E, F))
print('\nManhattan: E-D \t',pw.manhattan_distances(D, E))
print('Manhattan E-F: \t',pw.manhattan_distances(E, F))
Euclidean E-D: [[3.31662479]]
Euclidean E-F: [[2.82842712]]
Cosine: E-D [[0.99014754]]
Cosine E-F: [[0.7592566]]
Manhattan: E-D [[5.]]
Manhattan E-F: [[6.]]
import pandas as pd
# Toy frame used by the following cells:
#   'A' - numeric column with one missing value (None becomes NaN),
#   'B' - string column with three distinct values,
#   'C' - integer column,
#   'D' - string column with three distinct values.
dataset = pd.DataFrame({'A': [1, 2, None, 2],
                        'B': ['red', 'red', 'yellow', 'green'],
                        'C': [3300, 1250, 4600, 4500],
                        'D': ['MSK', 'SPB', 'EKB', 'MSK']})
dataset
 | A | B | C | D |
---|---|---|---|---|
0 | 1.0 | red | 3300 | MSK |
1 | 2.0 | red | 1250 | SPB |
2 | NaN | yellow | 4600 | EKB |
3 | 2.0 | green | 4500 | MSK |
# One-hot encoding: replace column 'B' with one indicator column per
# distinct value (named 'B_<value>'); the other columns are kept as they are.
dataset = pd.get_dummies(dataset, columns=['B'])
dataset
 | A | C | D | B_green | B_red | B_yellow |
---|---|---|---|---|---|---|
0 | 1.0 | 3300 | MSK | 0 | 1 | 0 |
1 | 2.0 | 1250 | SPB | 0 | 1 | 0 |
2 | NaN | 4600 | EKB | 0 | 0 | 1 |
3 | 2.0 | 4500 | MSK | 1 | 0 | 0 |
# Label encoding
from sklearn import preprocessing
= preprocessing.LabelEncoder()
le 'D']) le.fit(dataset[
LabelEncoder()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LabelEncoder()
# Overwrite column 'D' with the integer codes produced by the fitted encoder.
dataset['D'] = le.transform(dataset['D'])
dataset
 | A | C | D | B_green | B_red | B_yellow |
---|---|---|---|---|---|---|
0 | 1.0 | 3300 | 1 | 0 | 1 | 0 |
1 | 2.0 | 1250 | 2 | 0 | 1 | 0 |
2 | NaN | 4600 | 0 | 0 | 0 | 1 |
3 | 2.0 | 4500 | 1 | 1 | 0 | 0 |
# Fill in missing values: replace NaN in column 'A' with the column mean.
# Series.mean() skips NaN by default, so the mean is taken over the
# values that are present.
dataset['A'] = dataset['A'].fillna(dataset['A'].mean())
dataset
 | A | C | D | B_green | B_red | B_yellow |
---|---|---|---|---|---|---|
0 | 1.000000 | 3300 | 1 | 0 | 1 | 0 |
1 | 2.000000 | 1250 | 2 | 0 | 1 | 0 |
2 | 1.666667 | 4600 | 0 | 0 | 0 | 1 |
3 | 2.000000 | 4500 | 1 | 1 | 0 | 0 |
# Min-max normalisation of 'C': (x - min) / (max - min), so the smallest
# value maps to 0 and the largest to 1.
c_min = dataset['C'].min()
c_max = dataset['C'].max()
dataset['C_normalized'] = (dataset['C'] - c_min) / (c_max - c_min)

# Standardisation of 'C': (x - mean) / std, using pandas' default std().
dataset['C_standardized'] = (dataset['C'] - dataset['C'].mean()) / dataset['C'].std()
dataset
 | A | C | D | B_green | B_red | B_yellow | C_normalized | C_standardized |
---|---|---|---|---|---|---|---|---|
0 | 1.000000 | 3300 | 1 | 0 | 1 | 0 | 0.611940 | -0.072209 |
1 | 2.000000 | 1250 | 2 | 0 | 1 | 0 | 0.000000 | -1.388018 |
2 | 1.666667 | 4600 | 0 | 0 | 0 | 1 | 1.000000 | 0.762206 |
3 | 2.000000 | 4500 | 1 | 1 | 0 | 0 | 0.970149 | 0.698021 |
# Draw box plots of the two rescaled versions of 'C' side by side.
dataset.boxplot(['C_normalized','C_standardized'])
<Axes: >