Files
TDA/lections/notebooks/lec2_preprocess.ipynb
Andrey f3ceb052a3 lec02
2024-02-20 09:25:04 +03:00

36 KiB

import numpy as np 
import sklearn.metrics.pairwise as pw

# Sample points: the second coordinate (~4000) dominates both
# Euclidean and Manhattan distance unless the data is rescaled.
A = [[3, 4000, 5]]
B = [[3, 4000, 4]]
C = [[3, 4100, 5]]

# Compare A and B under three pairwise metrics.
for label, metric in (('Euclidean', pw.euclidean_distances),
                      ('Cosine', pw.cosine_similarity),
                      ('Manhattan', pw.manhattan_distances)):
    print(f'{label}: \t', metric(A, B))
Euclidean: 	 [[1.]]
Cosine: 	 [[0.99999997]]
Manhattan: 	 [[1.]]
# Same three metrics, now comparing A and C (differ by 100 in the large axis):
# Euclidean/Manhattan blow up while cosine similarity stays ~1.
for label, metric in (('Euclidean', pw.euclidean_distances),
                      ('Cosine', pw.cosine_similarity),
                      ('Manhattan', pw.manhattan_distances)):
    print(f'{label}: \t', metric(A, C))
Euclidean: 	 [[100.]]
Cosine: 	 [[1.]]
Manhattan: 	 [[100.]]
import sklearn.metrics.pairwise as pw

# D and E point in almost the same direction (D ~ 2*E up to rounding),
# while F is the flat all-ones vector.
D = [[6, 0, 0, 3, 3]]
E = [[3, 0, 0, 2, 2]]
F = [[1, 1, 1, 1, 1]]

# Table of (label, metric, pair) — labels reproduced exactly as in the lecture.
comparisons = (
    ('Euclidean E-D: \t', pw.euclidean_distances, (D, E)),
    ('Euclidean E-F: \t', pw.euclidean_distances, (E, F)),
    ('\nCosine: E-D \t', pw.cosine_similarity, (D, E)),
    ('Cosine E-F: \t', pw.cosine_similarity, (E, F)),
    ('\nManhattan: E-D \t', pw.manhattan_distances, (D, E)),
    ('Manhattan E-F: \t', pw.manhattan_distances, (E, F)),
)
for label, metric, (u, v) in comparisons:
    print(label, metric(u, v))
Euclidean E-D: 	 [[3.31662479]]
Euclidean E-F: 	 [[2.82842712]]

Cosine: E-D 	 [[0.99014754]]
Cosine E-F: 	 [[0.7592566]]

Manhattan: E-D 	 [[5.]]
Manhattan E-F: 	 [[6.]]
import pandas as pd

# Toy dataset: numeric column with one missing value (A), a categorical
# colour (B), a large-scale numeric column (C) and a city code (D).
raw_columns = {
    'A': [1, 2, None, 2],
    'B': ['red', 'red', 'yellow', 'green'],
    'C': [3300, 1250, 4600, 4500],
    'D': ['MSK', 'SPB', 'EKB', 'MSK'],
}
dataset = pd.DataFrame(raw_columns)
dataset
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
A B C D
0 1.0 red 3300 MSK
1 2.0 red 1250 SPB
2 NaN yellow 4600 EKB
3 2.0 green 4500 MSK
# One-hot encoding: replace categorical column B with
# B_green / B_red / B_yellow indicator columns.
encoded = pd.get_dummies(dataset, columns=['B'])
dataset = encoded
dataset
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
A C D B_green B_red B_yellow
0 1.0 3300 MSK 0 1 0
1 2.0 1250 SPB 0 1 0
2 NaN 4600 EKB 0 0 1
3 2.0 4500 MSK 1 0 0
# Label encoding
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(dataset['D'])
<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: "▸";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: "▾";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: "";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: "";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: "";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style>
LabelEncoder()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LabelEncoder()
# Replace the city codes in D with their learned integer labels.
dataset = dataset.assign(D=le.transform(dataset['D']))
dataset
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
A C D B_green B_red B_yellow
0 1.0 3300 1 0 1 0
1 2.0 1250 2 0 1 0
2 NaN 4600 0 0 0 1
3 2.0 4500 1 1 0 0
# Fill the missing value in A with the column mean.
# Series.mean() skips NaN by default, and avoids depending on the `np`
# alias imported in a distant cell (np.mean on a Series delegates here anyway).
dataset['A'] = dataset['A'].fillna(dataset['A'].mean())
dataset
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
A C D B_green B_red B_yellow
0 1.000000 3300 1 0 1 0
1 2.000000 1250 2 0 1 0
2 1.666667 4600 0 0 0 1
3 2.000000 4500 1 1 0 0
# Two common rescalings of column C:
#  - min-max normalisation onto [0, 1]
#  - z-score standardisation (zero mean, unit sample std).
c = dataset['C']
c_min, c_max = c.min(), c.max()
dataset['C_normalized'] = (c - c_min) / (c_max - c_min)
dataset['C_standardized'] = (c - c.mean()) / c.std()
dataset
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; } </style>
A C D B_green B_red B_yellow C_normalized C_standardized
0 1.000000 3300 1 0 1 0 0.611940 -0.072209
1 2.000000 1250 2 0 1 0 0.000000 -1.388018
2 1.666667 4600 0 0 0 1 1.000000 0.762206
3 2.000000 4500 1 1 0 0 0.970149 0.698021
dataset.boxplot(['C_normalized','C_standardized'])
<Axes: >