# Labs/ИАД/lr2/task2_HW.py

import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial import distance

# Load the MLB heights/weights table (tab-separated) and give its six
# columns short names that are convenient to reference from code.
mlb_data = pd.read_csv('SOCR_Data_MLB_HeightsWeights.txt', sep='\t')
mlb_data.columns = ['Name', 'Team', 'Position', 'Height', 'Weight', 'Age']

# Drop players with a missing measurement: a NaN would end up in the
# precomputed distance matrix, and scikit-learn estimators reject non-finite
# input. Resetting the index keeps row positions aligned with the label array
# that is attached to this frame later. No-op when the file has no gaps.
mlb_data = mlb_data.dropna(subset=['Height', 'Weight', 'Age']).reset_index(drop=True)

# Numeric feature frame with descriptive column names (used for plotting
# and for the per-cluster summary).
mlb_pd = pd.DataFrame(
    mlb_data[['Height', 'Weight', 'Age']].to_numpy(dtype=float),
    columns=['Height (inches)', 'Weight (pounds)', 'Age (years)'],
)

# Feature matrix fed to the clustering: one row per player, columns in the
# order height, weight, age.
data = mlb_pd.to_numpy()

# Pairwise Chebyshev (L-infinity) distance matrix between all rows of `data`.
# pdist computes each unordered pair once in native code; squareform expands
# the condensed result into the full symmetric matrix with a zero diagonal,
# which is the layout a 'precomputed' metric expects.
# NOTE(review): assumes `data` is a finite 2-D numeric array - confirm that
# missing values are handled before this point.
distance_matrix = distance.squareform(distance.pdist(data, metric='chebyshev'))

def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering model.

    Builds a SciPy-style linkage matrix from the model's ``children_``,
    ``distances_`` and ``labels_`` attributes and hands it to
    ``scipy.cluster.hierarchy.dendrogram``.

    Args:
        model: Fitted estimator exposing ``children_`` (one row of two child
            node ids per merge), ``distances_`` (one value per merge) and
            ``labels_`` (one entry per sample).
        **kwargs: Forwarded unchanged to ``dendrogram``.
    """
    leaf_total = len(model.labels_)
    merges = model.children_

    # sizes[k] is the number of original samples below the node created by
    # merge k. Ids below leaf_total are single samples; larger ids refer to
    # the node formed by merge (id - leaf_total), whose size is already known.
    sizes = np.zeros(merges.shape[0])
    for step, pair in enumerate(merges):
        sizes[step] = sum(
            1 if node < leaf_total else sizes[node - leaf_total]
            for node in pair
        )

    # Column layout expected by SciPy: child a, child b, distance, size.
    linkage_matrix = np.column_stack(
        [merges, model.distances_, sizes]
    ).astype(float)

    dendrogram(linkage_matrix, **kwargs)

# Single-linkage agglomerative clustering on the precomputed distance matrix.
# n_clusters is not passed, so the estimator's default applies;
# compute_distances=True asks the estimator to store the merge distances
# (distances_) that plot_dendrogram reads.
linkage = "single"
metric = 'precomputed'
model = AgglomerativeClustering(
    linkage=linkage,
    metric=metric,
    compute_distances=True,
).fit(distance_matrix)

print("Labels:", model.labels_)
print("Number of clusters:", np.unique(model.labels_).size)

# Dendrogram of the merge hierarchy, truncated to its top three levels.
dendro_ax = plt.figure(figsize=(12, 6)).gca()
dendro_ax.set_title(
    'Hierarchical Clustering Dendrogram - MLB Data \n metric="{}", linkage="{}"'.format(metric, linkage)
)
# No explicit axes are passed, so the dendrogram lands on the current axes,
# i.e. the ones created just above.
plot_dendrogram(model, truncate_mode="level", p=3)
dendro_ax.set_xlabel("Number of points in node")
dendro_ax.set_ylabel("Distance")
plt.show()

# 3D scatter of the players in (height, weight, age) space, coloured by the
# cluster label assigned to each player.
fig1 = plt.figure(figsize=(12, 8))
ax = fig1.add_subplot(projection='3d')

scatter = ax.scatter3D(
    mlb_pd['Height (inches)'],
    mlb_pd['Weight (pounds)'],
    mlb_pd['Age (years)'],
    c=model.labels_,
    cmap='tab10',
    alpha=0.7,
    s=50,
)

ax.set(
    title='Agglomerative Clustering - MLB Players \n metric="{}", linkage="{}"'.format(metric, linkage),
    xlabel='Height (inches)',
    ylabel='Weight (pounds)',
    zlabel='Age (years)',
)

# Colour bar acting as the legend for the cluster labels.
fig1.colorbar(scatter, ax=ax, label='Cluster')

plt.show()

# Per-cluster summary: member count and the mean of each feature.
print("\nИнформация о кластерах:")
for cluster_id in np.unique(model.labels_):
    # Boolean mask over rows; relies on mlb_pd rows being in the same order
    # as model.labels_.
    in_cluster = model.labels_ == cluster_id
    cluster_data = mlb_pd.loc[in_cluster]
    print(f"Cluster {cluster_id}: {cluster_data.shape[0]} players")
    print(f"  Средний рост: {cluster_data['Height (inches)'].mean():.1f} inches")
    print(f"  Средний вес: {cluster_data['Weight (pounds)'].mean():.1f} pounds")
    print(f"  Средний возраст: {cluster_data['Age (years)'].mean():.1f} years")
    print()

print("Распределение позиций по кластерам:")
# Copy of the full player table with each player's cluster label attached;
# the original mlb_data frame is left unmodified.
mlb_data_with_clusters = mlb_data.assign(Cluster=model.labels_)