Вы не можете выбрать более 25 тем
Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.
243 строки
9.1 KiB
Python
243 строки
9.1 KiB
Python
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
import pandas as pd
|
|
from sklearn.cluster import KMeans
|
|
from sklearn.preprocessing import StandardScaler
|
|
|
|
# Load the MLB players dataset (tab-separated text file) and give the six
# columns short, predictable names.
mlb_data = pd.read_csv('SOCR_Data_MLB_HeightsWeights.txt', sep='\t')
mlb_data.columns = ['Name', 'Team', 'Position', 'Height', 'Weight', 'Age']

# KMeans rejects NaN input, so players with a missing measurement are removed
# before any clustering. The index is reset so row positions stay contiguous
# and line up with the positional cluster labels assigned later.
mlb_data = mlb_data.dropna(subset=['Height', 'Weight', 'Age']).reset_index(drop=True)

# Numeric feature table with descriptive column names (used as axis labels).
# Going through a NumPy array gives all three columns one common dtype.
mlb_pd = pd.DataFrame(
    data=mlb_data[['Height', 'Weight', 'Age']].to_numpy(),
    columns=['Height (inches)', 'Weight (pounds)', 'Age (years)'],
)

# Standardize the features (zero mean, unit variance per column) so that no
# single measurement dominates the Euclidean distances used by K-means.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(
    mlb_pd[['Height (inches)', 'Weight (pounds)', 'Age (years)']]
)
|
|
|
|
# Numbers of clusters to try.
k_values = [2, 3, 4]

print("=" * 60)
print("НЕИЕРАРХИЧЕСКАЯ КЛАСТЕРИЗАЦИЯ - МЕТОД K-СРЕДНИХ")
print("=" * 60)

# One colour per cluster id; the same palette is reused for every K.
colors = ['red', 'blue', 'green', 'orange', 'purple']

for k in k_values:
    # Fit K-means on the standardized features; one label per player.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(data_scaled)

    # Centroids converted back from standardized units to the original scale.
    centers_original = scaler.inverse_transform(kmeans.cluster_centers_)

    # A separate 3D figure for each K.
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111, projection='3d')

    # Draw the members of each cluster in that cluster's colour.
    for cluster_id in range(k):
        cluster_data = mlb_pd[labels == cluster_id]
        ax.scatter3D(
            cluster_data['Height (inches)'],
            cluster_data['Weight (pounds)'],
            cluster_data['Age (years)'],
            c=colors[cluster_id],
            label=f'Cluster {cluster_id}',
            s=50,
            alpha=0.7,
            edgecolors='black',
            linewidth=0.5,
        )

    # Overlay the centroids as large black crosses.
    ax.scatter3D(
        centers_original[:, 0],
        centers_original[:, 1],
        centers_original[:, 2],
        c='black',
        marker='X',
        s=200,
        label='Centroids',
        edgecolors='white',
        linewidth=2,
    )

    ax.set_title(f'K-means Clustering\nK = {k} clusters', fontsize=16, pad=20)
    ax.set_xlabel('Height (inches)', fontsize=12, labelpad=10)
    ax.set_ylabel('Weight (pounds)', fontsize=12, labelpad=10)
    ax.set_zlabel('Age (years)', fontsize=12, labelpad=10)

    ax.legend()
    plt.tight_layout()
    plt.show()

    # Per-cluster summary: size, centroid, and mean ± std of each feature.
    print(f"\nK = {k}:")
    mlb_pd_temp = mlb_pd.assign(Cluster=labels)

    for cluster_id in range(k):
        cluster_data = mlb_pd_temp.loc[mlb_pd_temp['Cluster'] == cluster_id]
        center_height, center_weight, center_age = centers_original[cluster_id]
        heights = cluster_data['Height (inches)']
        weights = cluster_data['Weight (pounds)']
        ages = cluster_data['Age (years)']

        print(f" Cluster {cluster_id} ({len(cluster_data)} players):")
        print(f" Center: Height={center_height:.1f}\", Weight={center_weight:.1f}lbs, Age={center_age:.1f}yr")
        print(f" Stats: Height={heights.mean():.1f}±{heights.std():.1f}\"")
        print(f" Weight={weights.mean():.1f}±{weights.std():.1f}lbs")
        print(f" Age={ages.mean():.1f}±{ages.std():.1f}yr")
|
|
|
|
# COMPARATIVE ANALYSIS OF CLUSTERING QUALITY
print("\n" + "=" * 60)
print("СРАВНИТЕЛЬНЫЙ АНАЛИЗ КАЧЕСТВА КЛАСТЕРИЗАЦИИ")
print("=" * 60)

from sklearn.metrics import silhouette_score, calinski_harabasz_score

# One record of quality metrics per tested K.
results = []
for k in k_values:
    # Refit with the same seed so the metrics describe the clustering shown
    # for this K elsewhere in the script.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(data_scaled)

    row = {
        'K': k,
        'Silhouette': silhouette_score(data_scaled, labels),
        'Calinski-Harabasz': calinski_harabasz_score(data_scaled, labels),
        'Inertia': kmeans.inertia_,
    }
    results.append(row)

    print(f"K = {row['K']}:")
    print(f" Silhouette Score: {row['Silhouette']:.3f}")
    print(f" Calinski-Harabasz Score: {row['Calinski-Harabasz']:.1f}")
    print(f" Within-Cluster Sum of Squares: {row['Inertia']:.1f}")
|
|
|
|
# 3D VISUALIZATION FROM SEVERAL VIEWING ANGLES
print("\n" + "=" * 60)
print("3D ВИЗУАЛИЗАЦИЯ С РАЗНЫМИ УГЛАМИ ОБЗОРА")
print("=" * 60)

# Number of clusters chosen for the detailed analysis (the author's pick for
# the MLB data).
optimal_k = 3
kmeans_optimal = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
labels_optimal = kmeans_optimal.fit_predict(data_scaled)
# Centroids converted back from standardized units to the original scale.
centers_optimal = scaler.inverse_transform(kmeans_optimal.cluster_centers_)

# (elevation, azimuth) pairs in degrees, passed to Axes3D.view_init.
# Elevation is the camera angle above the height/weight plane.
view_angles = [
    (30, 45),   # oblique view
    (0, 0),     # side view: camera level with the height/weight plane
    (90, 0),    # top view: looking straight down the age axis
    (30, -45),  # oblique view from the other side
]

# One colour per cluster id; built once because it does not change per view.
colors = ['red', 'blue', 'green']

for i, (elev, azim) in enumerate(view_angles):
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, projection='3d')

    for cluster_id in range(optimal_k):
        cluster_data = mlb_pd[labels_optimal == cluster_id]
        ax.scatter3D(
            cluster_data['Height (inches)'],
            cluster_data['Weight (pounds)'],
            cluster_data['Age (years)'],
            c=colors[cluster_id],
            label=f'Cluster {cluster_id}',
            s=50,
            alpha=0.7,
        )

    # Centroids
    ax.scatter3D(
        centers_optimal[:, 0],
        centers_optimal[:, 1],
        centers_optimal[:, 2],
        c='black',
        marker='X',
        s=200,
        label='Centroids',
    )

    ax.view_init(elev=elev, azim=azim)
    # The title reports the K actually used rather than a hard-coded 3.
    ax.set_title(
        f'K-means (K={optimal_k}) - View {i + 1}\nElevation: {elev}°, Azimuth: {azim}°',
        fontsize=14,
    )
    ax.set_xlabel('Height (inches)')
    ax.set_ylabel('Weight (pounds)')
    ax.set_zlabel('Age (years)')
    ax.legend()

    # Each view is laid out and shown as its own figure.
    plt.tight_layout()
    plt.show()
|
|
|
|
# FINAL VISUALIZATION WITH POSITION ANALYSIS
print(
    "\n" + "=" * 60,
    "ФИНАЛЬНЫЙ АНАЛИЗ - РАСПРЕДЕЛЕНИЕ ПОЗИЦИЙ ПО КЛАСТЕРАМ",
    "=" * 60,
    sep="\n",
)

# Copy of the raw player table with the cluster label of the chosen K
# attached; the original table is left untouched.
mlb_data_with_clusters = mlb_data.assign(Cluster=labels_optimal)
|
|
|
|
|
|
# Group detailed positions into a few broad categories.
def categorize_position(pos):
    """Map a detailed position label to a broad position category.

    Matching is by substring, so labels such as ``'Starting_Pitcher'`` or
    ``'First_Baseman'`` are recognised.

    Args:
        pos: Position label. Anything that is not a string (for example the
            NaN pandas uses for a missing cell) is treated as unknown.

    Returns:
        One of ``'Pitcher'``, ``'Catcher'``, ``'Infielder'``,
        ``'Outfielder'`` or ``'Other'``.
    """
    # A missing cell arrives as a float NaN; `in` on a float would raise.
    if not isinstance(pos, str):
        return 'Other'
    if 'Pitcher' in pos:
        return 'Pitcher'
    if 'Catcher' in pos:
        return 'Catcher'
    # Shortstop is an infield position even though it is not a "Baseman".
    if 'Baseman' in pos or 'Shortstop' in pos:
        return 'Infielder'
    if 'Outfielder' in pos:
        return 'Outfielder'
    return 'Other'
|
|
|
|
|
|
# Tag every player with the broad category of their position.
position_categories = mlb_data_with_clusters['Position'].apply(categorize_position)
mlb_data_with_clusters['Position_Category'] = position_categories

print("\nРаспределение позиций по кластерам (K=3):")
# Rows are clusters, columns are position categories, cells are player
# counts; combinations that never occur are filled with 0.
cluster_position = (
    mlb_data_with_clusters
    .groupby(['Cluster', 'Position_Category'])
    .size()
    .unstack(fill_value=0)
)
print(cluster_position)
|
|
|
|
# Final figure with interpreted cluster names.
fig = plt.figure(figsize=(14, 10))
ax = fig.add_subplot(111, projection='3d')

colors = ['red', 'blue', 'green']

# K-means numbers its clusters arbitrarily, so cluster 0 is not necessarily
# the smallest build. The descriptive names are therefore assigned by rank:
# clusters are ordered by the combined standardized height and weight of
# their centroid (feature columns 0 and 1), smallest first.
build_names = ['Low Height/Weight', 'Medium Build', 'High Height/Weight']
build_score = kmeans_optimal.cluster_centers_[:, :2].sum(axis=1)
cluster_names = [None] * optimal_k
for rank, ranked_cluster_id in enumerate(np.argsort(build_score)):
    cluster_names[ranked_cluster_id] = build_names[rank]

for cluster_id in range(optimal_k):
    cluster_data = mlb_pd[labels_optimal == cluster_id]
    ax.scatter3D(
        cluster_data['Height (inches)'],
        cluster_data['Weight (pounds)'],
        cluster_data['Age (years)'],
        c=colors[cluster_id],
        label=f'{cluster_names[cluster_id]} ({len(cluster_data)} players)',
        s=60,
        alpha=0.8,
    )

# Centroids
ax.scatter3D(
    centers_optimal[:, 0],
    centers_optimal[:, 1],
    centers_optimal[:, 2],
    c='black',
    marker='X',
    s=300,
    label='Centroids',
    edgecolors='white',
    linewidth=3,
)

# The title reports the K actually used rather than a hard-coded 3.
ax.set_title(
    f'FINAL: K-means Clustering of MLB Players\nK = {optimal_k} (Optimal)',
    fontsize=16,
    pad=20,
)
ax.set_xlabel('Height (inches)\n← Shorter Taller →', fontsize=12, labelpad=15)
ax.set_ylabel('Weight (pounds)\n← Lighter Heavier →', fontsize=12, labelpad=15)
ax.set_zlabel('Age (years)\n← Younger Older →', fontsize=12, labelpad=15)

# Legend is anchored outside the axes so it does not cover the points.
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
|
|
|
|
# Closing summary printed to the console.
rule = "=" * 60
conclusions = (
    "1. K=2: Два крупных кластера (общее разделение по физическим параметрам)",
    "2. K=3: Оптимальное разделение (низкие/средние/высокие игроки)",
    "3. K=4: Более детальное разделение, но может быть избыточным",
    "4. Рекомендуется K=3 для MLB данных",
)

print("\n" + rule)
print("ИТОГОВЫЕ ВЫВОДЫ")
print(rule)
print(*conclusions, sep="\n")