36 KiB
36 KiB
# Compare three pairwise metrics on 3-dimensional points where one
# coordinate (the 4000-ish one) dominates the magnitude.
# NOTE: the original export fused cell outputs onto the code lines,
# making the script unrunnable; outputs are restored below as comments.
import numpy as np
import sklearn.metrics.pairwise as pw

A = [[3, 4000, 5]]
B = [[3, 4000, 4]]
C = [[3, 4100, 5]]

# A vs B: differ by 1 in a small coordinate.
print('Euclidean: \t', pw.euclidean_distances(A, B))
print('Cosine: \t', pw.cosine_similarity(A, B))
print('Manhattan: \t', pw.manhattan_distances(A, B))
# Output:
#   Euclidean:   [[1.]]
#   Cosine:      [[0.99999997]]
#   Manhattan:   [[1.]]

# A vs C: differ by 100 in the dominant coordinate — distances grow,
# but cosine similarity stays ~1 because the direction barely changes.
print('Euclidean: \t', pw.euclidean_distances(A, C))
print('Cosine: \t', pw.cosine_similarity(A, C))
print('Manhattan: \t', pw.manhattan_distances(A, C))
# Output:
#   Euclidean:   [[100.]]
#   Cosine:      [[1.]]
#   Manhattan:   [[100.]]
# Same three metrics on 5-dimensional points: D is E scaled (roughly),
# F is a constant vector. Cosine ranks E closer to D; Manhattan/Euclidean
# can disagree with cosine about "closeness".
# NOTE: the original export fused the cell output onto the last code
# line (L25); outputs are restored below as comments.
import sklearn.metrics.pairwise as pw

D = [[6, 0, 0, 3, 3]]
E = [[3, 0, 0, 2, 2]]
F = [[1, 1, 1, 1, 1]]

print('Euclidean E-D: \t', pw.euclidean_distances(D, E))
print('Euclidean E-F: \t', pw.euclidean_distances(E, F))
print('\nCosine: E-D \t', pw.cosine_similarity(D, E))
print('Cosine E-F: \t', pw.cosine_similarity(E, F))
print('\nManhattan: E-D \t', pw.manhattan_distances(D, E))
print('Manhattan E-F: \t', pw.manhattan_distances(E, F))
# Output:
#   Euclidean E-D:  [[3.31662479]]
#   Euclidean E-F:  [[2.82842712]]
#   Cosine: E-D     [[0.99014754]]
#   Cosine E-F:     [[0.7592566]]
#   Manhattan: E-D  [[5.]]
#   Manhattan E-F:  [[6.]]
import pandas as pd

# Toy frame for the preprocessing walkthrough: numeric column 'A' with
# one missing value, categorical colour 'B', numeric 'C', city code 'D'.
dataset = pd.DataFrame(
    {
        'A': [1, 2, None, 2],
        'B': ['red', 'red', 'yellow', 'green'],
        'C': [3300, 1250, 4600, 4500],
        'D': ['MSK', 'SPB', 'EKB', 'MSK'],
    }
)
dataset
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|   | A | B | C | D |
|---|---|---|---|---|
| 0 | 1.0 | red | 3300 | MSK |
| 1 | 2.0 | red | 1250 | SPB |
| 2 | NaN | yellow | 4600 | EKB |
| 3 | 2.0 | green | 4500 | MSK |
# One-hot encode the categorical colour column 'B': the single column is
# replaced by B_green / B_red / B_yellow indicator columns.
dataset = pd.get_dummies(dataset, columns=['B'])
dataset
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|   | A | C | D | B_green | B_red | B_yellow |
|---|---|---|---|---|---|---|
| 0 | 1.0 | 3300 | MSK | 0 | 1 | 0 |
| 1 | 2.0 | 1250 | SPB | 0 | 1 | 0 |
| 2 | NaN | 4600 | EKB | 0 | 0 | 1 |
| 3 | 2.0 | 4500 | MSK | 1 | 0 | 0 |
# Label-encode the city column 'D': fit maps each distinct city to an
# integer code (sorted order, so EKB=0, MSK=1, SPB=2 here).
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(dataset['D'])
<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: "▸";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: "▾";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: "";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: "";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: "";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style>
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LabelEncoder()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LabelEncoder()
# Replace the city names in 'D' with the integer codes fitted above.
dataset['D'] = le.transform(dataset['D'])
dataset
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|   | A | C | D | B_green | B_red | B_yellow |
|---|---|---|---|---|---|---|
| 0 | 1.0 | 3300 | 1 | 0 | 1 | 0 |
| 1 | 2.0 | 1250 | 2 | 0 | 1 | 0 |
| 2 | NaN | 4600 | 0 | 0 | 0 | 1 |
| 3 | 2.0 | 4500 | 1 | 1 | 0 | 0 |
# Fill the missing value in 'A' with the column mean.
# Series.mean() skips NaN, so the NaN row gets mean(1, 2, 2) = 1.666667.
# (Idiomatic pandas: use the Series method rather than np.mean on it.)
dataset['A'] = dataset['A'].fillna(dataset['A'].mean())
dataset
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|   | A | C | D | B_green | B_red | B_yellow |
|---|---|---|---|---|---|---|
| 0 | 1.000000 | 3300 | 1 | 0 | 1 | 0 |
| 1 | 2.000000 | 1250 | 2 | 0 | 1 | 0 |
| 2 | 1.666667 | 4600 | 0 | 0 | 0 | 1 |
| 3 | 2.000000 | 4500 | 1 | 1 | 0 | 0 |
# Rescale 'C' two ways: min-max normalization into [0, 1] and
# z-score standardization (sample std, ddof=1 — pandas default).
c = dataset['C']
dataset['C_normalized'] = (c - c.min()) / (c.max() - c.min())
dataset['C_standardized'] = (c - c.mean()) / c.std()
dataset
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|   | A | C | D | B_green | B_red | B_yellow | C_normalized | C_standardized |
|---|---|---|---|---|---|---|---|---|
| 0 | 1.000000 | 3300 | 1 | 0 | 1 | 0 | 0.611940 | -0.072209 |
| 1 | 2.000000 | 1250 | 2 | 0 | 1 | 0 | 0.000000 | -1.388018 |
| 2 | 1.666667 | 4600 | 0 | 0 | 0 | 1 | 1.000000 | 0.762206 |
| 3 | 2.000000 | 4500 | 1 | 1 | 0 | 0 | 0.970149 | 0.698021 |
