import numpy as np 
import sklearn.metrics.pairwise as pw

A = [[3, 4000, 5]]
B = [[3, 4000, 4]]
C = [[3, 4100, 5]]

print('Euclidean: \t',pw.euclidean_distances(A, B))
print('Cosine: \t',pw.cosine_similarity(A, B))
print('Manhattan: \t',pw.manhattan_distances(A, B))

Euclidean: 	 [[1.]]
Cosine: 	 [[0.99999997]]
Manhattan: 	 [[1.]]

print('Euclidean: \t',pw.euclidean_distances(A, C))
print('Cosine: \t',pw.cosine_similarity(A, C))
print('Manhattan: \t',pw.manhattan_distances(A, C))

Euclidean: 	 [[100.]]
Cosine: 	 [[1.]]
Manhattan: 	 [[100.]]
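To make explicit what these pairwise functions compute, here is a small sketch (an addition, not part of the original notebook) that reproduces the A–B numbers with plain NumPy formulas:

import numpy as np

a = np.array([3, 4000, 5])
b = np.array([3, 4000, 4])

# Euclidean distance: square root of the sum of squared coordinate differences
print(np.sqrt(np.sum((a - b) ** 2)))                       # 1.0

# Manhattan distance: sum of absolute coordinate differences
print(np.sum(np.abs(a - b)))                               # 1

# Cosine similarity: dot product divided by the product of the vector norms
print(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))     # ~0.99999997

The large second coordinate (4000 vs 4100) dominates the Euclidean and Manhattan distances between A and C, while cosine similarity compares only directions and still reports the two vectors as almost identical.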
import sklearn.metrics.pairwise as pw

D = [[6,0,0,3,3]]
E = [[3,0,0,2,2]]
F = [[1,1,1,1,1]]

print('Euclidean E-D: \t',pw.euclidean_distances(D, E))
print('Euclidean E-F: \t',pw.euclidean_distances(E, F))
print('\nCosine E-D: \t',pw.cosine_similarity(D, E))
print('Cosine E-F: \t',pw.cosine_similarity(E, F))
print('\nManhattan E-D: \t',pw.manhattan_distances(D, E))
print('Manhattan E-F: \t',pw.manhattan_distances(E, F))

Euclidean E-D: 	 [[3.31662479]]
Euclidean E-F: 	 [[2.82842712]]

Cosine E-D: 	 [[0.99014754]]
Cosine E-F: 	 [[0.7592566]]

Manhattan E-D: 	 [[5.]]
Manhattan E-F: 	 [[6.]]
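Euclidean distance calls E closer to F (2.83 vs 3.32), while cosine similarity calls E closer to D (0.99 vs 0.76): D is roughly E scaled by two, so the two vectors point in almost the same direction. A small sketch (not part of the original notebook, reusing D, E, F from the cell above) scales each vector to unit length; with magnitude removed, Euclidean distance agrees with cosine, since ||u - v||^2 = 2 * (1 - cos(u, v)) for unit vectors:

import numpy as np
import sklearn.metrics.pairwise as pw

# scale a vector to unit length so only its direction matters
unit = lambda v: np.asarray(v, dtype=float) / np.linalg.norm(v)

print(pw.euclidean_distances([unit(D[0])], [unit(E[0])]))   # ~0.14, D and E are nearly parallel
print(pw.euclidean_distances([unit(E[0])], [unit(F[0])]))   # ~0.69, E and F differ in direction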
import pandas as pd
dataset = pd.DataFrame({'A': [1 , 2, None, 2], 
                        'B': ['red', 'red', 'yellow', 'green'], 
                        'C': [3300, 1250, 4600, 4500],
                        'D': ['MSK', 'SPB', 'EKB', 'MSK']})
dataset

|   | A | B | C | D |
|---|---|---|---|---|
| 0 | 1.0 | red | 3300 | MSK | 
| 1 | 2.0 | red | 1250 | SPB | 
| 2 | NaN | yellow | 4600 | EKB | 
| 3 | 2.0 | green | 4500 | MSK | 
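A quick look at the column types and missing values (an added check, not in the original notebook) explains the preprocessing steps that follow: B and D are string columns that need encoding, and A has a missing value that needs imputation.

print(dataset.dtypes)         # A is float64, B and D are object, C is int64
print(dataset.isna().sum())   # only column A has a missing value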
# One-hot encoding (OHE)
dataset = pd.get_dummies(dataset, columns = ['B'])
dataset

|   | A | C | D | B_green | B_red | B_yellow |
|---|---|---|---|---|---|---|
| 0 | 1.0 | 3300 | MSK | 0 | 1 | 0 | 
| 1 | 2.0 | 1250 | SPB | 0 | 1 | 0 | 
| 2 | NaN | 4600 | EKB | 0 | 0 | 1 | 
| 3 | 2.0 | 4500 | MSK | 1 | 0 | 0 | 
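pd.get_dummies is the quickest way to one-hot encode a single DataFrame. An alternative sketch (assumed here, not from the original notebook) uses sklearn's OneHotEncoder, which remembers the categories seen during fit and can be reused inside a pipeline on new data:

from sklearn.preprocessing import OneHotEncoder

# handle_unknown='ignore' encodes unseen categories as all zeros instead of raising;
# sparse_output=False returns a dense array (the parameter is named sparse in older releases)
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded = ohe.fit_transform([['red'], ['red'], ['yellow'], ['green']])
print(ohe.get_feature_names_out())   # ['x0_green' 'x0_red' 'x0_yellow']
print(encoded)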
# Label encoding
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(dataset['D'])

LabelEncoder()
dataset['D'] = le.transform(dataset['D'])
dataset

|   | A | C | D | B_green | B_red | B_yellow |
|---|---|---|---|---|---|---|
| 0 | 1.0 | 3300 | 1 | 0 | 1 | 0 | 
| 1 | 2.0 | 1250 | 2 | 0 | 1 | 0 | 
| 2 | NaN | 4600 | 0 | 0 | 0 | 1 | 
| 3 | 2.0 | 4500 | 1 | 1 | 0 | 0 | 
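LabelEncoder assigns integer codes in alphabetical order of the labels, which is why EKB became 0, MSK 1 and SPB 2. A quick check of the learned mapping (an addition to the original cells):

print(le.classes_)                          # ['EKB' 'MSK' 'SPB'] -> codes 0, 1, 2
print(le.inverse_transform(dataset['D']))   # ['MSK' 'SPB' 'EKB' 'MSK']

Note that these codes imply an ordering (EKB < MSK < SPB) that has no real meaning, which is why one-hot encoding is usually preferred for nominal features.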
# Fill in the missing values
dataset['A'] = dataset['A'].fillna(np.mean(dataset['A']))
dataset

|   | A | C | D | B_green | B_red | B_yellow |
|---|---|---|---|---|---|---|
| 0 | 1.000000 | 3300 | 1 | 0 | 1 | 0 | 
| 1 | 2.000000 | 1250 | 2 | 0 | 1 | 0 | 
| 2 | 1.666667 | 4600 | 0 | 0 | 0 | 1 | 
| 3 | 2.000000 | 4500 | 1 | 1 | 0 | 0 | 
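Mean imputation is done here in place with fillna. An equivalent sketch (assumed, not from the original notebook) uses sklearn's SimpleImputer, which can be fitted on training data and then applied to new data with the same learned mean:

from sklearn.impute import SimpleImputer
import numpy as np

imputer = SimpleImputer(strategy='mean')
filled = imputer.fit_transform([[1.0], [2.0], [np.nan], [2.0]])
print(filled.ravel())   # [1.  2.  1.66666667  2.]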
dataset['C_normalized'] = (dataset['C'] - dataset['C'].min()) / (dataset['C'].max() - dataset['C'].min())
dataset['C_standardized'] = (dataset['C'] - dataset['C'].mean()) / dataset['C'].std()
dataset

|   | A | C | D | B_green | B_red | B_yellow | C_normalized | C_standardized |
|---|---|---|---|---|---|---|---|---|
| 0 | 1.000000 | 3300 | 1 | 0 | 1 | 0 | 0.611940 | -0.072209 | 
| 1 | 2.000000 | 1250 | 2 | 0 | 1 | 0 | 0.000000 | -1.388018 | 
| 2 | 1.666667 | 4600 | 0 | 0 | 0 | 1 | 1.000000 | 0.762206 | 
| 3 | 2.000000 | 4500 | 1 | 1 | 0 | 0 | 0.970149 | 0.698021 | 
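The same two transformations are available as sklearn transformers; a sketch (assumed, not from the original notebook) applied to the C column. Note that StandardScaler divides by the population standard deviation (ddof=0), while pandas .std() uses the sample standard deviation (ddof=1), so its output differs slightly from C_standardized above:

from sklearn.preprocessing import MinMaxScaler, StandardScaler

print(MinMaxScaler().fit_transform(dataset[['C']]).ravel())    # matches C_normalized
print(StandardScaler().fit_transform(dataset[['C']]).ravel())  # close to C_standardized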
dataset.boxplot(['C_normalized','C_standardized'])

<Axes: >

[boxplot comparing the spread of C_normalized (values in the range 0-1) and C_standardized (values centered around 0)]
