Вы не можете выбрать более 25 тем
Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.
307 KiB
307 KiB
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import silhouette_score
from bokeh.io import output_notebook, show
# Route Bokeh output to the notebook before any figures are shown.
output_notebook()
from bokeh.models import ColumnDataSource, HoverTool, BoxAnnotation
from bokeh.models.tools import *
from bokeh.plotting import figure
from bokeh.transform import factor_cmap, CategoricalColorMapper
from bokeh.palettes import Category20
"(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n\n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"<div style='background-color: #fdd'>\\n\"+\n \"<p>\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"</p>\\n\"+\n \"<ul>\\n\"+\n \"<li>re-rerun `output_notebook()` to attempt to load from CDN again, or</li>\\n\"+\n \"<li>use INLINE resources instead, as so:</li>\\n\"+\n \"</ul>\\n\"+\n \"<code>\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"</code>\\n\"+\n \"</div>\"}};\n\n function display_loaded() {\n const el = document.getElementById(\"ef503a75-3e3b-446c-9160-ddfe41cc28eb\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n 
return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.1.min.js\"];\n const css_urls = [];\n\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {\n }\n ];\n\n function run_inline_js() {\n if 
(root.Bokeh !== undefined || force === true) {\n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\nif (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(\"ef503a75-3e3b-446c-9160-ddfe41cc28eb\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));"
# Alternative dataset (two interleaving half-moons):
#X,y = datasets.make_moons(n_samples=100, random_state = 42, noise = 0.1 )
# Synthetic dataset: 100 points drawn around 6 blob centers.
X, y = datasets.make_blobs(n_samples=100, centers=6, random_state=45)

# Raw, unlabeled view of the data.
plt.scatter(X[:, 0], X[:, 1])
plt.show()
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.pyplot as plt
import pandas as pd
# Hierarchical clustering using the linkage function (Ward's method).
mergings = linkage(X, method='ward')
# Build the dendrogram, using a figure size that is convenient for display.
plt.figure(figsize=(15, 10))
dendrogram(mergings)
plt.show()
# Cut the hierarchy into flat clusters at distance threshold 10.
# The criterion can be 'distance' or 'maxclust'.
T = fcluster(mergings, 10, 'distance')
print(T)
[1 2 3 1 5 5 1 6 6 4 1 6 2 2 2 2 2 1 3 4 5 2 3 6 5 4 3 6 5 4 6 6 5 1 4 3 3
1 6 6 3 2 1 4 5 3 5 5 1 6 4 2 5 6 2 4 2 4 4 1 1 3 1 4 1 5 2 6 3 6 6 2 3 3
1 2 5 5 4 5 2 4 4 3 4 2 3 6 2 3 6 3 5 1 6 3 5 4 4 1]
# Points colored by their flat hierarchical-cluster label.
plt.figure(figsize=(5, 5))
plt.scatter(X[:, 0], X[:, 1], c=T)
plt.show()
def update_cluster_centers(X, c, num_clusters):
    """Return the centroid of every cluster.

    Args:
        X: 2-D array-like of samples, one row per sample.
        c: 1-D array-like of cluster labels, one per row of ``X``.
            Labels are 1-based: label ``k`` maps to row ``k - 1`` of
            the result.
        num_clusters: Number of clusters; labels ``1..num_clusters``
            are considered.

    Returns:
        Array of shape ``(num_clusters, X.shape[1])`` whose row
        ``k - 1`` is the mean of the samples labeled ``k``. A cluster
        with no samples gets a row of NaN.
    """
    X = np.asarray(X, dtype=float)
    c = np.asarray(c)
    # Allocate the result locally instead of relying on a module-level
    # ``mu``; NaN marks clusters that end up without any members.
    mu = np.full((num_clusters, X.shape[1]), np.nan)
    for cl in range(1, num_clusters + 1):
        members = X[c == cl]
        if members.size:
            mu[cl - 1, :] = members.mean(axis=0)
    return mu
# One centroid per distinct flat-cluster label.
num_clusters = len(set(T))
# Centroid table: one row per cluster, two coordinates per row.
mu = np.zeros([num_clusters, 2])
mu = update_cluster_centers(X, T, num_clusters)
print(mu)
[[ 8.20412426 2.061867 ]
[-6.74804529 -7.51382813]
[-4.16045299 1.00683372]]
# Clustered points with the cluster centroids overlaid in red.
plt.figure(figsize=(5, 5))
plt.scatter(X[:, 0], X[:, 1], c=T)
plt.scatter(mu[:, 0], mu[:, 1], c='red', marker='o')
plt.show()
Bokeh plots
# Tabular form of the clustered points for Bokeh.
data_df = pd.DataFrame({'x': X[:, 0],
                        'y': X[:, 1],
                        'cluster': T})
# The categorical color mapper works on string factors.
data_df['cluster'] = data_df['cluster'].astype('str')

# One palette color per distinct cluster label.
palette = Category20[len(data_df['cluster'].unique())]
color_map = CategoricalColorMapper(factors=data_df['cluster'].unique(),
                                   palette=palette)

p = figure(sizing_mode="stretch_width", max_width=1800, height=500, title='Clustering')
source = ColumnDataSource(data_df)
# NOTE(review): scatter glyphs are normally sized with `size`; confirm that
# `width` is accepted by the installed Bokeh version.
p.scatter(source=source, x='x', y='y', color={'field': 'cluster', 'transform': color_map}, width=3)
# Cluster centroids in red.
p.scatter(mu[:, 0], mu[:, 1], color='red', width=5)

p.add_tools(HoverTool(tooltips=[("x", "@x"), ("y", "@y"), ("cluster", "@cluster")]))
show(p)
""
Сумма квадратов расстояний до центроида
# Sum of squared distances to the centroid
cluster_dist = 0
for j in range(np.shape(mu)[0]):
    summ = 0
    # Cluster labels in T are 1-based, so row j of mu belongs to label j + 1.
    obj = np.where(T == j + 1)
    for i in range(np.shape(obj)[1]):
        # euclidean_distances returns a 1x1 matrix here, so summ (and the
        # totals printed below) become 1x1 arrays.
        summ = summ + (euclidean_distances(mu[j].reshape(1, -1), X[obj[0][i], :].reshape(1, -1))) ** 2
    cluster_dist = cluster_dist + summ
    print(j, ' custer dist: ', summ)
print("Summary of squared cluster dist: ", cluster_dist)
print("Mean summary of squared cluster dist: ", cluster_dist / np.shape(mu)[0])
0 custer dist: [[140.52532826]]
1 custer dist: [[284.50246387]]
2 custer dist: [[445.8365626]]
Summary of squared cluster dist: [[870.86435472]]
Mean summary of squared cluster dist: [[290.28811824]]
K-means
from sklearn.cluster import KMeans
# K-means with 4 clusters; n_init=10 runs ten initializations.
model = KMeans(n_clusters=4, n_init=10)
model.fit(X)
all_predictions = model.predict(X)
print(all_predictions)
[1 1 0 1 2 2 1 3 3 0 1 3 1 1 1 1 1 1 0 0 2 1 0 3 2 0 0 3 2 0 3 3 2 1 0 0 0
1 3 3 0 1 1 0 2 0 2 2 1 3 0 1 2 3 1 0 1 0 0 1 1 0 1 0 1 2 1 3 0 3 3 1 0 0
1 1 2 2 0 2 1 0 0 0 0 1 0 3 1 0 3 0 2 1 3 0 2 0 0 1]
# Points colored by their K-means cluster assignment.
plt.figure(figsize=(5, 5))
plt.scatter(X[:, 0], X[:, 1], c=all_predictions)
plt.show()
# Report the fitted model's inertia_ attribute.
print('Sum of squared distances of samples to their closest cluster center.:', model.inertia_)
Sum of squared distances of samples to their closest cluster center.: 472.08573606137327
# Collect inertia and silhouette score for k = 2..9.
inertia = []
silhouette = []
for k in range(2, 10):
    kmeans = KMeans(n_clusters=k, random_state=1, n_init=10).fit(X)
    inertia.append(kmeans.inertia_)
    all_predictions = kmeans.predict(X)
    silhouette.append(silhouette_score(X, all_predictions))
print(inertia)
[2194.619727635032, 870.8643547241178, 472.08573606137327, 259.5795564009951, 176.1282308577753, 153.70943167635176, 143.24864846484422, 125.24089654012067]
# Inertia as a function of the number of clusters (elbow plot).
plt.figure(figsize=(5, 5))
plt.plot(range(2, 10), inertia, marker='s')
# Call show(); a bare `plt.show` only references the function.
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
# Silhouette score as a function of the number of clusters.
plt.figure(figsize=(5, 5))
plt.plot(range(2, 10), silhouette, marker='s')
# Call show(); a bare `plt.show` only references the function.
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>
# Refit K-means with 5 clusters and plot the resulting assignment.
kmeans = KMeans(n_clusters=5, random_state=1, n_init=10).fit(X)
plt.scatter(X[:, 0], X[:, 1], c=kmeans.predict(X))
plt.show()
DBSCAN
from sklearn.cluster import DBSCAN
#db = DBSCAN(eps=0.3, min_samples=7).fit(X) # parameters for make_moons
db = DBSCAN(eps=1.2, min_samples=6).fit(X)
labels = db.labels_
# Label -1 is excluded from the cluster count and tallied separately as noise.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
labels
Estimated number of clusters: 5
Estimated number of noise points: 21
array([ 0, 0, 1, 0, 2, 2, 0, 3, 3, 4, 0, 3, 0, -1, 0, 0, 0,
0, 1, 4, 2, 0, -1, 3, 2, -1, 1, 3, 2, 4, -1, 3, 2, 0,
4, 1, 1, -1, -1, -1, -1, -1, 0, 4, 2, 1, -1, 2, 0, 3, 4,
0, 2, 3, 0, -1, -1, 4, -1, 0, 0, 1, 0, 4, 0, 2, 0, 3,
1, 3, 3, -1, -1, -1, 0, 0, 2, 2, -1, 2, 0, -1, 4, 1, 4,
0, 1, -1, 0, 1, 3, 1, 2, 0, -1, 1, 2, 4, -1, 0],
dtype=int64)
# Points colored by their DBSCAN label.
plt.scatter(X[:, 0], X[:, 1], c=labels)
plt.show()