Вы не можете выбрать более 25 тем. Темы должны начинаться с буквы или цифры, могут содержать дефисы (-) и должны содержать не более 35 символов.

307 KiB

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import silhouette_score
from bokeh.io import output_notebook, show
output_notebook()
from bokeh.models import ColumnDataSource, HoverTool, BoxAnnotation
from bokeh.models.tools import * 
from bokeh.plotting import figure

from bokeh.transform import factor_cmap, CategoricalColorMapper
from bokeh.palettes import Category20
Loading BokehJS ...
"(function(root) {\n  function now() {\n    return new Date();\n  }\n\n  const force = true;\n\n  if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n    root._bokeh_onload_callbacks = [];\n    root._bokeh_is_loading = undefined;\n  }\n\n\n  if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n    root._bokeh_timeout = Date.now() + 5000;\n    root._bokeh_failed_load = false;\n  }\n\n  const NB_LOAD_WARNING = {'data': {'text/html':\n     \"<div style='background-color: #fdd'>\\n\"+\n     \"<p>\\n\"+\n     \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n     \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n     \"</p>\\n\"+\n     \"<ul>\\n\"+\n     \"<li>re-rerun `output_notebook()` to attempt to load from CDN again, or</li>\\n\"+\n     \"<li>use INLINE resources instead, as so:</li>\\n\"+\n     \"</ul>\\n\"+\n     \"<code>\\n\"+\n     \"from bokeh.resources import INLINE\\n\"+\n     \"output_notebook(resources=INLINE)\\n\"+\n     \"</code>\\n\"+\n     \"</div>\"}};\n\n  function display_loaded() {\n    const el = document.getElementById(\"ef503a75-3e3b-446c-9160-ddfe41cc28eb\");\n    if (el != null) {\n      el.textContent = \"BokehJS is loading...\";\n    }\n    if (root.Bokeh !== undefined) {\n      if (el != null) {\n        el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n      }\n    } else if (Date.now() < root._bokeh_timeout) {\n      setTimeout(display_loaded, 100)\n    }\n  }\n\n  function run_callbacks() {\n    try {\n      root._bokeh_onload_callbacks.forEach(function(callback) {\n        if (callback != null)\n          callback();\n      });\n    } finally {\n      delete root._bokeh_onload_callbacks\n    }\n    console.debug(\"Bokeh: all callbacks have finished\");\n  }\n\n  function load_libs(css_urls, js_urls, callback) {\n    if (css_urls == null) css_urls = [];\n    if (js_urls == null) js_urls = 
[];\n\n    root._bokeh_onload_callbacks.push(callback);\n    if (root._bokeh_is_loading > 0) {\n      console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n      return null;\n    }\n    if (js_urls == null || js_urls.length === 0) {\n      run_callbacks();\n      return null;\n    }\n    console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n    root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n    function on_load() {\n      root._bokeh_is_loading--;\n      if (root._bokeh_is_loading === 0) {\n        console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n        run_callbacks()\n      }\n    }\n\n    function on_error(url) {\n      console.error(\"failed to load \" + url);\n    }\n\n    for (let i = 0; i < css_urls.length; i++) {\n      const url = css_urls[i];\n      const element = document.createElement(\"link\");\n      element.onload = on_load;\n      element.onerror = on_error.bind(null, url);\n      element.rel = \"stylesheet\";\n      element.type = \"text/css\";\n      element.href = url;\n      console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n      document.body.appendChild(element);\n    }\n\n    for (let i = 0; i < js_urls.length; i++) {\n      const url = js_urls[i];\n      const element = document.createElement('script');\n      element.onload = on_load;\n      element.onerror = on_error.bind(null, url);\n      element.async = false;\n      element.src = url;\n      console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n      document.head.appendChild(element);\n    }\n  };\n\n  function inject_raw_css(css) {\n    const element = document.createElement(\"style\");\n    element.appendChild(document.createTextNode(css));\n    document.body.appendChild(element);\n  }\n\n  const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.1.min.js\", 
\"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.1.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.1.min.js\"];\n  const css_urls = [];\n\n  const inline_js = [    function(Bokeh) {\n      Bokeh.set_log_level(\"info\");\n    },\nfunction(Bokeh) {\n    }\n  ];\n\n  function run_inline_js() {\n    if (root.Bokeh !== undefined || force === true) {\n          for (let i = 0; i < inline_js.length; i++) {\n      inline_js[i].call(root, root.Bokeh);\n    }\nif (force === true) {\n        display_loaded();\n      }} else if (Date.now() < root._bokeh_timeout) {\n      setTimeout(run_inline_js, 100);\n    } else if (!root._bokeh_failed_load) {\n      console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n      root._bokeh_failed_load = true;\n    } else if (force !== true) {\n      const cell = $(document.getElementById(\"ef503a75-3e3b-446c-9160-ddfe41cc28eb\")).parents('.cell').data().cell;\n      cell.output_area.append_execute_result(NB_LOAD_WARNING)\n    }\n  }\n\n  if (root._bokeh_is_loading === 0) {\n    console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n    run_inline_js();\n  } else {\n    load_libs(css_urls, js_urls, function() {\n      console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n      run_inline_js();\n    });\n  }\n}(window));"

# Synthetic dataset shared by all clustering experiments in this notebook.
# Disabled alternative: the "moons" dataset (seeded, with a little noise).
#X,y = datasets.make_moons(n_samples=100, random_state = 42, noise = 0.1 )
# Active dataset: 100 samples drawn around 6 blob centers, with a fixed seed.
X,y = datasets.make_blobs(n_samples=100, centers = 6, random_state =45  )

# Show the raw, unlabeled points (column 0 against column 1).
plt.scatter (X[:,0], X[:,1])
plt.show()

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.pyplot as plt
import pandas as pd
 

# Hierarchical clustering implemented with scipy's linkage function,
# using the 'ward' linkage method.

mergings = linkage(X, method='ward')

# Draw the dendrogram, with figure parameters chosen to keep it readable.

plt.figure(figsize=(15, 10))
dendrogram(mergings)
plt.show()
 

# Cut the linkage tree into flat clusters. With the 'distance' criterion the
# value 10 acts as the distance threshold; 'maxclust' is the alternative
# criterion, under which the value would be a maximum number of clusters.
# T holds one integer cluster label per sample.
T = fcluster(mergings,10, 'distance') # criterion: 'distance' or 'maxclust'
print (T)
[1 2 3 1 5 5 1 6 6 4 1 6 2 2 2 2 2 1 3 4 5 2 3 6 5 4 3 6 5 4 6 6 5 1 4 3 3
 1 6 6 3 2 1 4 5 3 5 5 1 6 4 2 5 6 2 4 2 4 4 1 1 3 1 4 1 5 2 6 3 6 6 2 3 3
 1 2 5 5 4 5 2 4 4 3 4 2 3 6 2 3 6 3 5 1 6 3 5 4 4 1]
# Scatter plot of the data, colored by the hierarchical cluster label in T.
plt.figure(figsize=(5, 5))
plt.scatter (X[:,0], X[:,1], c=T)
plt.show()

def update_cluster_centers(X, c, num_clusters):
    """Return the centroid (mean point) of every cluster.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Sample coordinates.
    c : array-like of shape (n_samples,)
        Cluster label of each sample. Labels are 1-based: the clusters are
        numbered ``1 .. num_clusters``.
    num_clusters : int
        Number of clusters to compute centroids for.

    Returns
    -------
    numpy.ndarray of shape (num_clusters, n_features)
        Row ``k - 1`` is the mean of the samples labeled ``k``. A cluster
        with no samples gets a row of NaN.
    """
    X = np.asarray(X, dtype=float)
    c = np.asarray(c)
    # Allocate the result locally instead of writing into a module-level
    # array: the function then works without any pre-existing global and
    # adapts to the number of feature columns in X.
    mu = np.full((num_clusters, X.shape[1]), np.nan)
    for cl in range(1, num_clusters + 1):
        members = X[c == cl]
        # Skip empty clusters so no mean-of-empty-slice warning is raised;
        # their rows keep the NaN fill value.
        if len(members):
            mu[cl - 1, :] = members.mean(axis=0)
    return mu
# Number of clusters = number of distinct labels in T.
num_clusters = len(set(T))
# All-zero centroid array: one row per cluster, two coordinate columns.
mu = np.zeros([num_clusters,2])
# Centroid of every hierarchical cluster (labels in T, coordinates in X).
mu = update_cluster_centers(X, T, num_clusters)
print(mu)
[[ 8.20412426  2.061867  ]
 [-6.74804529 -7.51382813]
 [-4.16045299  1.00683372]]
# Clustered points (colored by label T) with the centroids overlaid in red.
plt.figure(figsize = (5,5))
plt.scatter (X[:,0], X[:,1], c=T)
plt.scatter(mu[:,0],mu[:,1], c = 'red', marker = 'o')
plt.show()

Bokeh plots

# Interactive Bokeh version of the cluster plot.
# Collect the coordinates and the cluster label of every sample in one table.
data_df =pd.DataFrame({'x' : X[:,0], 
             'y' : X[:,1], 
             'cluster' : T})
# Labels are converted to strings so they can be used as categorical factors.
data_df['cluster']=data_df['cluster'].astype('str')
# Pick the Category20 palette whose size equals the number of distinct labels.
# NOTE(review): Category20 is keyed by palette size and, as far as I know,
# only covers sizes 3..20 -- other cluster counts would raise KeyError; confirm.
palette = Category20[len(data_df['cluster'].unique())]
color_map = CategoricalColorMapper(factors=data_df['cluster'].unique(),
                                   palette=palette)
p=figure(sizing_mode="stretch_width", max_width=1800, height=500, title = 'Clustering')
source = ColumnDataSource(data_df)
# Data points, colored through the categorical mapper on the 'cluster' column.
# NOTE(review): `width` does not look like a scatter marker property (marker
# size is normally `size`) -- confirm this call is accepted by the Bokeh
# version in use.
p.scatter(source=source, x='x', y='y',color={'field': 'cluster', 'transform': color_map}, width = 3)
# Cluster centroids, drawn in red on top of the data points.
p.scatter(mu[:,0],mu[:,1], color = 'red', width = 5)

# Hover tooltip showing the coordinates and cluster of the point under the cursor.
p.add_tools(HoverTool(tooltips=[("x", "@x"), ("y", "@y"), ("cluster", "@cluster")]))

show(p)
""

Сумма квадратов расстояний до центроида

# Sum of squared distances from every sample to the centroid of its cluster.

cluster_dist = 0
for j in range(np.shape(mu)[0]):
    summ = 0
    # Row j of mu is the centroid of the cluster labeled j + 1 in T.
    obj = np.where(T == j + 1)
    for i in obj[0]:
        # euclidean_distances returns a 1x1 array here, so summ (and hence
        # cluster_dist) become 1x1 arrays and are printed as [[value]].
        summ = summ + euclidean_distances(mu[j].reshape(1, -1), X[i, :].reshape(1, -1)) ** 2
    cluster_dist = cluster_dist + summ
    print(j, ' custer dist: ', summ)
print("Summary of squared cluster dist: ", cluster_dist)

# Average of the per-cluster sums over the number of clusters.
print("Mean summary of squared cluster dist: ", cluster_dist / np.shape(mu)[0])
0  custer dist:  [[140.52532826]]
1  custer dist:  [[284.50246387]]
2  custer dist:  [[445.8365626]]
Summary of squared cluster dist:  [[870.86435472]]
Mean summary of squared cluster dist:  [[290.28811824]]

K-means

from sklearn.cluster import KMeans
# K-means with 4 clusters and 10 initializations. No random_state is passed
# here, so this fit is not seeded.
model = KMeans(n_clusters=4, n_init=10)
model.fit(X)
# Cluster index assigned to every sample of X by the fitted model.
all_predictions = model.predict(X)
print (all_predictions)
[1 1 0 1 2 2 1 3 3 0 1 3 1 1 1 1 1 1 0 0 2 1 0 3 2 0 0 3 2 0 3 3 2 1 0 0 0
 1 3 3 0 1 1 0 2 0 2 2 1 3 0 1 2 3 1 0 1 0 0 1 1 0 1 0 1 2 1 3 0 3 3 1 0 0
 1 1 2 2 0 2 1 0 0 0 0 1 0 3 1 0 3 0 2 1 3 0 2 0 0 1]
# Scatter plot of the data, colored by the k-means cluster assignment.
plt.figure(figsize = (5,5))
plt.scatter (X[:,0], X[:,1], c=all_predictions)
plt.show()

# Report the inertia_ attribute of the fitted model.
print('Sum of squared distances of samples to their closest cluster center.:', model.inertia_)
Sum of squared distances of samples to their closest cluster center.: 472.08573606137327
# Sweep the number of clusters k over 2..9 and record, for each k, the
# k-means inertia and the silhouette score of the resulting labeling.
inertia = []
silhouette = []
for k in range(2,10):
    # Seeded fit (random_state=1) with 10 initializations.
    kmeans = KMeans(n_clusters=k, random_state=1, n_init=10).fit(X)
    inertia.append((kmeans.inertia_))
    # NOTE: this rebinds the module-level all_predictions on every iteration.
    all_predictions = kmeans.predict(X)
    silhouette.append(silhouette_score(X, all_predictions))
print (inertia)
[2194.619727635032, 870.8643547241178, 472.08573606137327, 259.5795564009951, 176.1282308577753, 153.70943167635176, 143.24864846484422, 125.24089654012067]
# Inertia as a function of the number of clusters k (k = 2..9).
plt.figure(figsize=(5,5))
plt.plot(range (2,10), inertia, marker='s')

# plt.show has to be called; a bare `plt.show` only evaluates to the
# function object and does not display the figure.
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>

# Silhouette score as a function of the number of clusters k (k = 2..9).
plt.figure(figsize=(5,5))
plt.plot(range (2,10), silhouette, marker='s')

# plt.show has to be called; a bare `plt.show` only evaluates to the
# function object and does not display the figure.
plt.show()
<function matplotlib.pyplot.show(close=None, block=None)>

# Refit k-means with 5 clusters (seeded) and plot the data colored by the
# predicted cluster of each sample.
kmeans = KMeans(n_clusters=5, random_state=1, n_init=10).fit(X)
plt.scatter (X[:,0], X[:,1], c=kmeans.predict(X))
plt.show()

DBSCAN

from sklearn.cluster import DBSCAN
# Disabled alternative: parameters for the make_moons dataset.
#db = DBSCAN(eps=0.3, min_samples=7).fit(X)
# DBSCAN with the parameters used for the current dataset.
db = DBSCAN(eps=1.2, min_samples=6).fit(X)
labels = db.labels_
# The label -1 is treated as noise: it is excluded from the cluster count
# and counted separately.
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)

print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
# Bare expression: displays the label array as the notebook cell result.
labels
Estimated number of clusters: 5
Estimated number of noise points: 21
array([ 0,  0,  1,  0,  2,  2,  0,  3,  3,  4,  0,  3,  0, -1,  0,  0,  0,
        0,  1,  4,  2,  0, -1,  3,  2, -1,  1,  3,  2,  4, -1,  3,  2,  0,
        4,  1,  1, -1, -1, -1, -1, -1,  0,  4,  2,  1, -1,  2,  0,  3,  4,
        0,  2,  3,  0, -1, -1,  4, -1,  0,  0,  1,  0,  4,  0,  2,  0,  3,
        1,  3,  3, -1, -1, -1,  0,  0,  2,  2, -1,  2,  0, -1,  4,  1,  4,
        0,  1, -1,  0,  1,  3,  1,  2,  0, -1,  1,  2,  4, -1,  0],
      dtype=int64)
# Scatter plot of the data, colored by DBSCAN label (noise points carry -1).
plt.scatter (X[:,0], X[:,1], c=labels)
plt.show()