import numpy as np
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial import distance

# Load the MLB dataset (tab-separated) and give the columns short names.
mlb_data = pd.read_csv('SOCR_Data_MLB_HeightsWeights.txt', sep='\t')
mlb_data.columns = ['Name', 'Team', 'Position', 'Height', 'Weight', 'Age']

# A row with a missing measurement would put NaN into the distance matrix,
# so such rows are removed before any distances are computed. This is a
# no-op when every player has all three measurements.
# NOTE(review): confirm whether the data file actually contains such rows.
mlb_data = mlb_data.dropna(subset=['Height', 'Weight', 'Age']).reset_index(drop=True)

# Numeric feature table with descriptive column names (used for plotting
# and for the per-cluster summaries).
mlb_pd = pd.DataFrame(
    data=mlb_data[['Height', 'Weight', 'Age']].to_numpy(),
    columns=['Height (inches)', 'Weight (pounds)', 'Age (years)'],
)

# Feature matrix used for clustering: one row per player.
data = mlb_pd[['Height (inches)', 'Weight (pounds)', 'Age (years)']].to_numpy()

# Chebyshev distance matrix: pdist computes every pair once in compiled
# code and squareform expands the condensed result into the symmetric
# (n, n) matrix with a zero diagonal.
distance_matrix = distance.squareform(distance.pdist(data, metric='chebyshev'))


def plot_dendrogram(model, **kwargs) -> None:
    """Draw a dendrogram for a fitted hierarchical clustering model.

    Builds a SciPy-style linkage matrix from the model and passes it to
    ``scipy.cluster.hierarchy.dendrogram``.

    Args:
        model: Fitted estimator exposing ``children_``, ``distances_`` and
            ``labels_``.
        **kwargs: Forwarded unchanged to ``dendrogram``.
    """
    # Number of original samples under each merge node.
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                # Indices below n_samples refer to original samples (leaves).
                current_count += 1
            else:
                # Larger indices refer to an earlier merge; reuse its count.
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    # Linkage matrix columns: the two children, merge distance, sample count.
    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram.
    dendrogram(linkage_matrix, **kwargs)


# NOTE: the model below does not set distance_threshold; plot_dendrogram
# reads model.distances_, which is why compute_distances=True is passed.
# Clustering setup: single linkage over the precomputed distance matrix.
# compute_distances=True makes the fitted model expose distances_, which
# plot_dendrogram needs.
metric = 'precomputed'
linkage = "single"
model = AgglomerativeClustering(
    compute_distances=True,
    metric=metric,
    linkage=linkage,
).fit(distance_matrix)

labels = model.labels_
cluster_ids = np.unique(labels)
print("Labels:", labels)
print("Number of clusters:", len(cluster_ids))

# Dendrogram visualisation.
dendrogram_title = 'Hierarchical Clustering Dendrogram - MLB Data \n metric="{}", linkage="{}"'.format(metric, linkage)
plt.figure(figsize=(12, 6))
plt.title(dendrogram_title)
# Only the top three levels of the tree are drawn.
plot_dendrogram(model, truncate_mode="level", p=3)
plt.xlabel("Number of points in node")
plt.ylabel("Distance")
plt.show()

# 3D visualisation of the clusters, one point per player coloured by label.
scatter_title = 'Agglomerative Clustering - MLB Players \n metric="{}", linkage="{}"'.format(metric, linkage)
fig1 = plt.figure(figsize=(12, 8))
ax = plt.axes(projection='3d')
scatter = ax.scatter3D(
    mlb_pd['Height (inches)'],
    mlb_pd['Weight (pounds)'],
    mlb_pd['Age (years)'],
    c=labels,
    cmap='tab10',
    s=50,
    alpha=0.7,
)
ax.set_title(scatter_title)
ax.set_xlabel('Height (inches)')
ax.set_ylabel('Weight (pounds)')
ax.set_zlabel('Age (years)')
# Colour legend mapping colours to cluster labels.
plt.colorbar(scatter, ax=ax, label='Cluster')
plt.show()

# Per-cluster summary: size and mean of each feature.
print("\nИнформация о кластерах:")
for cluster_id in cluster_ids:
    cluster_data = mlb_pd.loc[labels == cluster_id]
    mean_height = cluster_data['Height (inches)'].mean()
    mean_weight = cluster_data['Weight (pounds)'].mean()
    mean_age = cluster_data['Age (years)'].mean()
    print(f"Cluster {cluster_id}: {len(cluster_data)} players")
    print(f" Средний рост: {mean_height:.1f} inches")
    print(f" Средний вес: {mean_weight:.1f} pounds")
    print(f" Средний возраст: {mean_age:.1f} years")
    print()

# Attach the cluster label to a copy of the full player table.
print("Распределение позиций по кластерам:")
mlb_data_with_clusters = mlb_data.copy()
mlb_data_with_clusters['Cluster'] = labels