Вы не можете выбрать более 25 тем
Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.
102 строки
3.9 KiB
Python
102 строки
3.9 KiB
Python
import numpy as np
|
|
from matplotlib import pyplot as plt
|
|
from scipy.cluster.hierarchy import dendrogram
|
|
import pandas as pd
|
|
from sklearn.cluster import AgglomerativeClustering
|
|
from scipy.spatial import distance
|
|
|
|
# Load the MLB players dataset (tab-separated file) and give its six
# columns fixed names.
mlb_data = pd.read_csv('SOCR_Data_MLB_HeightsWeights.txt', sep='\t')
mlb_data.columns = ['Name', 'Team', 'Position', 'Height', 'Weight', 'Age']

# Build a numeric-only DataFrame (same layout as the iris example) whose
# column names carry the measurement units.
source_columns = ['Height', 'Weight', 'Age']
feature_columns = ['Height (inches)', 'Weight (pounds)', 'Age (years)']
mlb_pd = pd.DataFrame(
    data=np.c_[mlb_data[source_columns]],
    columns=feature_columns,
)

# Feature matrix used for clustering: one row per player.
data = mlb_pd[feature_columns].to_numpy()
|
|
|
|
# Chebyshev (L-infinity) distance between every pair of rows in `data`.
# cdist does the pairwise evaluation inside SciPy instead of calling
# distance.chebyshev once per pair from a nested Python loop; the resulting
# matrix is square, symmetric and has zeros on the diagonal, as before.
# NOTE(review): assumes `data` has no missing values — confirm, since NaN
# handling may differ between the per-pair function and cdist.
distance_matrix = distance.cdist(data, data, metric='chebyshev')
|
|
|
|
def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering model.

    The function reads ``model.children_``, ``model.distances_`` and
    ``model.labels_``, assembles a linkage matrix from them and hands it to
    ``scipy.cluster.hierarchy.dendrogram``.

    Args:
        model: Fitted estimator exposing ``children_`` (one row of two child
            indices per merge), ``distances_`` (one value per merge) and
            ``labels_`` (one entry per sample).
        **kwargs: Forwarded unchanged to ``dendrogram``.

    Returns:
        None. The result of ``dendrogram`` is not returned.
    """
    merges = model.children_
    leaf_total = len(model.labels_)

    # Number of original samples underneath each merge node. An index below
    # leaf_total is a leaf (counts as one sample); any other index refers to
    # an earlier merge, whose size has already been stored.
    node_sizes = np.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        node_sizes[row] = sum(
            1 if child < leaf_total else node_sizes[child - leaf_total]
            for child in pair
        )

    # Linkage matrix columns: child a, child b, merge distance, node size.
    columns = [merges, model.distances_, node_sizes]
    linkage_matrix = np.column_stack(columns).astype(float)

    # Plot the corresponding dendrogram.
    dendrogram(linkage_matrix, **kwargs)
|
|
|
|
# Agglomerative clustering on the precomputed distance matrix.
# compute_distances=True is requested because plot_dendrogram reads
# model.distances_. Neither n_clusters nor distance_threshold is passed
# here, so the estimator's defaults decide where the tree is cut.
metric = 'precomputed'
linkage = "single"
model = AgglomerativeClustering(
    compute_distances=True,
    metric=metric,
    linkage=linkage,
).fit(distance_matrix)

print("Labels:", model.labels_)
print("Number of clusters:", np.unique(model.labels_).size)
|
|
|
|
# Dendrogram figure.
plt.figure(figsize=(12, 6))
dendrogram_title = (
    f'Hierarchical Clustering Dendrogram - MLB Data \n metric="{metric}", linkage="{linkage}"'
)
plt.title(dendrogram_title)

# Plot only the top three levels of the dendrogram.
plot_dendrogram(model, truncate_mode="level", p=3)

plt.xlabel("Number of points in node")
plt.ylabel("Distance")
plt.show()
|
|
|
|
# 3D scatter plot of the players, coloured by cluster label.
fig1 = plt.figure(figsize=(12, 8))
ax = plt.axes(projection='3d')

# x, y and z columns, in plotting order; the same names label the axes.
axis_names = ('Height (inches)', 'Weight (pounds)', 'Age (years)')
marker_style = dict(c=model.labels_, cmap='tab10', s=50, alpha=0.7)
scatter = ax.scatter3D(
    *(mlb_pd[name] for name in axis_names),
    **marker_style,
)

ax.set_title(
    f'Agglomerative Clustering - MLB Players \n metric="{metric}", linkage="{linkage}"'
)
ax.set_xlabel(axis_names[0])
ax.set_ylabel(axis_names[1])
ax.set_zlabel(axis_names[2])

# Colour bar acting as the cluster legend.
plt.colorbar(scatter, ax=ax, label='Cluster')

plt.show()
|
|
|
|
# Per-cluster summary: member count and mean height, weight and age.
print("\nИнформация о кластерах:")
for cluster_id in np.unique(model.labels_):
    in_cluster = model.labels_ == cluster_id
    cluster_data = mlb_pd.loc[in_cluster]
    mean_height = cluster_data['Height (inches)'].mean()
    mean_weight = cluster_data['Weight (pounds)'].mean()
    mean_age = cluster_data['Age (years)'].mean()
    print(f"Cluster {cluster_id}: {cluster_data.shape[0]} players")
    print(f" Средний рост: {mean_height:.1f} inches")
    print(f" Средний вес: {mean_weight:.1f} pounds")
    print(f" Средний возраст: {mean_age:.1f} years")
    print()

# Copy of the original table with each player's cluster label attached.
print("Распределение позиций по кластерам:")
mlb_data_with_clusters = mlb_data.assign(Cluster=model.labels_)
|
|
|