Вы не можете выбрать более 25 тем
Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.
251 KiB
251 KiB
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import silhouette_score
# Alternative dataset (two interleaving half-moons), kept for experimentation:
# X, y = datasets.make_moons(n_samples=100, random_state=42, noise=0.1)

# Generate 100 sample points scattered around 6 blob centers.
X, y = datasets.make_blobs(n_samples=100, centers=6, random_state=45)

# Plot the raw, unlabeled points using their first two coordinates.
plt.scatter(X[:, 0], X[:, 1])
plt.show()
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.pyplot as plt
import pandas as pd
# Hierarchical clustering via scipy's linkage(), using the single-linkage
# criterion; the result is the merge table consumed by dendrogram/fcluster.
mergings = linkage(X, method="single")
# Строим дендрограмму, указав параметры удобные для отображения
=(15, 10))
plt.figure(figsize
dendrogram(mergings)
plt.show()
# Flatten the hierarchy into cluster labels using threshold t=10 under the
# 'distance' criterion ('maxclust' is the alternative criterion to try here).
T = fcluster(mergings, t=10, criterion="distance")
print(T)
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
# Scatter the samples, coloring each point by its flat cluster label T.
plt.figure(figsize=(5, 5))
plt.scatter(X[:, 0], X[:, 1], c=T)
plt.show()
def update_cluster_centers(X, c, n_clusters=4):
    """Return the centroid of every cluster in a 1-based flat labelling.

    Args:
        X: Array of shape (n_samples, n_features) holding the observations.
        c: Sequence of n_samples integer labels; label ``k`` with
            ``1 <= k <= n_clusters`` marks membership of cluster ``k``.
        n_clusters: Number of centroid rows to produce. Defaults to 4, the
            count that used to be hard-coded.

    Returns:
        A new float array of shape (n_clusters, n_features). Row ``k - 1`` is
        the mean of the samples labelled ``k``; the row of a cluster that has
        no members is filled with NaN.
    """
    X = np.asarray(X, dtype=float)
    c = np.asarray(c)

    # Build a fresh result instead of writing into a module-level ``mu``, so
    # the function no longer depends on a global being defined beforehand.
    mu = np.full((n_clusters, X.shape[1]), np.nan)

    for k in range(n_clusters):
        members = X[c == k + 1]
        # Skip empty clusters explicitly: averaging an empty slice would emit
        # a RuntimeWarning; the NaN placeholder already marks "no centroid".
        if len(members):
            mu[k] = members.mean(axis=0)
    return mu
# Start from four zero-valued 2-D centroids, then replace them with the
# per-cluster means computed from the hierarchical labels T.
mu = np.zeros((4, 2))
mu = update_cluster_centers(X, T)
print(mu)
[[-0.95992388 -1.54203033]
[ nan nan]
[ nan nan]
[ nan nan]]
C:\Users\Андрей\AppData\Local\Programs\Python\Python39\lib\site-packages\numpy\core\fromnumeric.py:3432: RuntimeWarning: Mean of empty slice.
return _methods._mean(a, axis=axis, dtype=dtype,
C:\Users\Андрей\AppData\Local\Programs\Python\Python39\lib\site-packages\numpy\core\_methods.py:182: RuntimeWarning: invalid value encountered in divide
ret = um.true_divide(
# Samples colored by cluster label, with the centroids overlaid as red dots.
plt.figure(figsize=(5, 5))
plt.scatter(X[:, 0], X[:, 1], c=T)
plt.scatter(mu[:, 0], mu[:, 1], c="red", marker="o")
plt.show()
# Sum of squared distances from every sample to the centroid of its cluster.
cluster_dist = 0.0
n_centroids = np.shape(mu)[0]
for j in range(n_centroids):
    # Labels in T are 1-based, so centroid row j belongs to label j + 1.
    members = X[T == j + 1]
    # Squared Euclidean distance is the sum of squared coordinate differences.
    # A cluster without members yields an empty array and therefore 0.0, so a
    # NaN centroid never enters the total.
    summ = float(np.sum((members - mu[j]) ** 2))
    cluster_dist = cluster_dist + summ
    print(j,' custer dist: ', summ)
print ("Summary of squared cluster dist: ", cluster_dist)
print ("Mean summary of squared cluster dist: ", cluster_dist / n_centroids)
0 custer dist: [[6974.82709438]]
1 custer dist: 0
2 custer dist: 0
3 custer dist: 0
Summary of squared cluster dist: [[6974.82709438]]
Mean summary of squared cluster dist: [[1743.70677359]]
K-means
from sklearn.cluster import KMeans
# Fit k-means with 4 clusters. random_state is fixed (matching the other
# KMeans calls in this file) so that the labels and inertia are reproducible
# from run to run.
model = KMeans(n_clusters=4, random_state=1, n_init=10)
model.fit(X)

# Assign every sample to its nearest learned centroid.
all_predictions = model.predict(X)
print(all_predictions)
[1 1 2 1 3 3 1 0 0 2 1 0 1 1 1 1 1 1 2 2 3 1 2 0 3 2 2 0 3 2 0 0 3 1 2 2 2
1 0 0 2 1 1 2 3 2 3 3 1 0 2 1 3 0 1 2 1 2 2 1 1 2 1 2 1 3 1 0 2 0 0 1 2 2
1 1 3 3 2 3 1 2 2 2 2 1 2 0 1 2 0 2 3 1 0 2 3 2 2 1]
# Samples colored by the k-means cluster they were assigned to.
plt.figure(figsize=(5, 5))
plt.scatter(X[:, 0], X[:, 1], c=all_predictions)
plt.show()

# Report the fitted model's inertia_ attribute.
print('Sum of squared distances of samples to their closest cluster center.:', model.inertia_)
Sum of squared distances of samples to their closest cluster center.: 472.08573606137327
# Elbow method: record the k-means inertia for k = 1..9.
inertia = []
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, random_state=1, n_init=10).fit(X)
    inertia.append(kmeans.inertia_)
print(inertia)
[6974.82709437959, 2194.619727635032, 870.8643547241178, 472.08573606137327, 259.5795564009951, 176.1282308577753, 153.70943167635176, 143.24864846484422, 125.24089654012067]
# Elbow plot: inertia as a function of the number of clusters k = 1..9.
plt.figure(figsize=(5, 5))
plt.plot(range(1, 10), inertia, marker='s')
# show must be *called*; a bare `plt.show` only evaluates to the function
# object and never renders the figure.
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
# Refit k-means with 6 clusters and color the samples by predicted cluster.
kmeans = KMeans(n_clusters=6, random_state=1, n_init=10).fit(X)
plt.scatter(X[:, 0], X[:, 1], c=kmeans.predict(X))
plt.show()
DBSCAN
from sklearn.cluster import DBSCAN
# Parameters intended for the make_moons dataset:
# db = DBSCAN(eps=0.3, min_samples=7).fit(X)
db = DBSCAN(eps=1.2, min_samples=6).fit(X)
labels = db.labels_

# The label -1 is counted as noise, so it is excluded from the cluster count.
n_clusters_ = len(set(labels) - {-1})
n_noise_ = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
labels
Estimated number of clusters: 5
Estimated number of noise points: 21
array([ 0, 0, 1, 0, 2, 2, 0, 3, 3, 4, 0, 3, 0, -1, 0, 0, 0,
0, 1, 4, 2, 0, -1, 3, 2, -1, 1, 3, 2, 4, -1, 3, 2, 0,
4, 1, 1, -1, -1, -1, -1, -1, 0, 4, 2, 1, -1, 2, 0, 3, 4,
0, 2, 3, 0, -1, -1, 4, -1, 0, 0, 1, 0, 4, 0, 2, 0, 3,
1, 3, 3, -1, -1, -1, 0, 0, 2, 2, -1, 2, 0, -1, 4, 1, 4,
0, 1, -1, 0, 1, 3, 1, 2, 0, -1, 1, 2, 4, -1, 0],
dtype=int64)
# Samples colored by their DBSCAN label.
plt.scatter(X[:, 0], X[:, 1], c=labels)
plt.show()

# NOTE(review): `labels` still contains the noise label -1, so noise points
# are presumably scored as if they formed a cluster of their own - confirm
# that this is intended before interpreting the score.
silhouette_score(X, labels)
0.4595289684168414