Вы не можете выбрать более 25 тем Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.

102 строки
3.9 KiB
Python

import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial import distance
# Load the MLB players table (tab-separated) and assign fixed column names.
mlb_data = pd.read_csv('SOCR_Data_MLB_HeightsWeights.txt', sep='\t')
mlb_data.columns = ['Name', 'Team', 'Position', 'Height', 'Weight', 'Age']

# Numeric-only DataFrame holding the three features, with descriptive labels.
mlb_pd = pd.DataFrame(data=np.c_[mlb_data[['Height', 'Weight', 'Age']]],
                      columns=['Height (inches)', 'Weight (pounds)', 'Age (years)'])

# Feature matrix used for clustering: one row per player.
data = mlb_pd[['Height (inches)', 'Weight (pounds)', 'Age (years)']].to_numpy()

# Pairwise Chebyshev distance matrix (largest absolute coordinate difference
# between two rows). A single cdist call replaces the former Python-level
# double loop over all pairs; the result is the same square, symmetric matrix
# with a zero diagonal.
distance_matrix = distance.cdist(data, data, metric='chebyshev')
def plot_dendrogram(model, **kwargs):
    """Draw a SciPy dendrogram for a fitted agglomerative clustering model.

    Reads ``children_``, ``distances_`` and ``labels_`` from ``model``, builds
    a linkage matrix with the columns (child, child, distance, sample count)
    and hands it to ``dendrogram``. Any extra keyword arguments are forwarded
    to ``dendrogram`` unchanged. Returns ``None``.
    """
    merges = model.children_
    leaf_total = len(model.labels_)

    # Number of original samples underneath each merge node, in merge order.
    node_sizes = np.zeros(merges.shape[0])
    for node, children in enumerate(merges):
        # An index below leaf_total counts as one sample; any other index
        # refers to the earlier merge stored at position (index - leaf_total).
        node_sizes[node] = sum(
            1 if child < leaf_total else node_sizes[child - leaf_total]
            for child in children
        )

    linkage_matrix = np.column_stack(
        [merges, model.distances_, node_sizes]
    ).astype(float)

    # Render the dendrogram described by the linkage matrix.
    dendrogram(linkage_matrix, **kwargs)
# Cluster on the precomputed distance matrix using single linkage.
# No distance_threshold is passed, so the number of clusters is the
# estimator's default; compute_distances=True is requested so that
# plot_dendrogram can read model.distances_.
metric = 'precomputed'
linkage = "single"
model = AgglomerativeClustering(
    linkage=linkage,
    metric=metric,
    compute_distances=True,
).fit(distance_matrix)

print("Labels:", model.labels_)
print("Number of clusters:", np.unique(model.labels_).size)
# Dendrogram figure.
dendrogram_title = 'Hierarchical Clustering Dendrogram - MLB Data \n metric="{}", linkage="{}"'.format(metric, linkage)
plt.figure(figsize=(12, 6))
plt.title(dendrogram_title)

# Truncate the tree by level (p=3) so only its top part is drawn.
dendrogram_options = {"truncate_mode": "level", "p": 3}
plot_dendrogram(model, **dendrogram_options)

plt.xlabel("Number of points in node")
plt.ylabel("Distance")
plt.show()
# 3D scatter plot of the players, coloured by cluster label.
height_label = 'Height (inches)'
weight_label = 'Weight (pounds)'
age_label = 'Age (years)'

fig1 = plt.figure(figsize=(12, 8))
ax = plt.axes(projection='3d')
scatter = ax.scatter3D(
    mlb_pd[height_label],
    mlb_pd[weight_label],
    mlb_pd[age_label],
    c=model.labels_,
    cmap='tab10',
    s=50,
    alpha=0.7,
)

ax.set_title('Agglomerative Clustering - MLB Players \n metric="{}", linkage="{}"'.format(metric, linkage))
ax.set_xlabel(height_label)
ax.set_ylabel(weight_label)
ax.set_zlabel(age_label)

# Colour bar acting as the legend for the cluster labels.
plt.colorbar(scatter, ax=ax, label='Cluster')
plt.show()
# Per-cluster summary: size and mean height / weight / age.
print("\nИнформация о кластерах:")
for cluster_id in np.unique(model.labels_):
    # Rows of the feature table whose label equals this cluster.
    cluster_data = mlb_pd[model.labels_ == cluster_id]
    mean_height = cluster_data['Height (inches)'].mean()
    mean_weight = cluster_data['Weight (pounds)'].mean()
    mean_age = cluster_data['Age (years)'].mean()

    print(f"Cluster {cluster_id}: {len(cluster_data)} players")
    print(f" Средний рост: {mean_height:.1f} inches")
    print(f" Средний вес: {mean_weight:.1f} pounds")
    print(f" Средний возраст: {mean_age:.1f} years")
    # NOTE(review): the blank separator line is assumed to be printed once
    # per cluster — confirm against the original indentation.
    print()

print("Распределение позиций по кластерам:")
# Copy of the full player table with the cluster label attached as a column.
mlb_data_with_clusters = mlb_data.assign(Cluster=model.labels_)