Вы не можете выбрать более 25 тем
Темы должны начинаться с буквы или цифры, могут содержать дефисы(-) и должны содержать не более 35 символов.
43 строки
1.7 KiB
Python
43 строки
1.7 KiB
Python
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
import pandas as pd
|
|
from sklearn.manifold import TSNE
|
|
from sklearn.cluster import KMeans
|
|
from sklearn.metrics import silhouette_score
|
|
|
|
# Load the MLB dataset (tab-separated text file) and give its six columns
# short, uniform names.
mlb_data = pd.read_csv('SOCR_Data_MLB_HeightsWeights.txt', sep='\t')
mlb_data.columns = ['Name', 'Team', 'Position', 'Height', 'Weight', 'Age']

# Keep only the three numeric features under descriptive, unit-bearing names.
# Rows with a missing value are dropped because scikit-learn's TSNE refuses
# NaN input and would otherwise abort the whole run; the index is rebuilt so
# the frame is contiguous after the drop.
mlb_pd = (
    mlb_data[['Height', 'Weight', 'Age']]
    .dropna()
    .astype(float)
    .rename(columns={
        'Height': 'Height (inches)',
        'Weight': 'Weight (pounds)',
        'Age': 'Age (years)',
    })
    .reset_index(drop=True)
)

# Plain (n_samples, 3) float matrix consumed by the embedding step.
data = mlb_pd.to_numpy()
|
|
|
|
# Perplexity values to compare in the cluster analysis
best_perplexities = [10, 30, 60, 90]

for perplexity in best_perplexities:
    # t-SNE: project the feature matrix onto two components, seeded with a
    # fixed random_state.
    tsne = TSNE(
        n_components=2,
        learning_rate='auto',
        init='random',
        perplexity=perplexity,
        random_state=42,
    )
    data_embedded = tsne.fit_transform(data)

    # K-means: split the embedded points into three clusters
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    labels = kmeans.fit_predict(data_embedded)

    # Silhouette score of that clustering, computed on the embedded points
    sil_score = silhouette_score(data_embedded, labels)

    # Visualization: one scatter plot per perplexity, coloured by cluster
    fig, ax = plt.subplots(figsize=(8, 6))
    scatter = ax.scatter(
        data_embedded[:, 0],
        data_embedded[:, 1],
        c=labels,
        cmap='tab10',
        alpha=0.7,
        s=40,
    )
    ax.set_title(f't-SNE + K-means\nPerplexity = {perplexity}\nSilhouette: {sil_score:.3f}')
    ax.set_xlabel('t-SNE Component 1')
    ax.set_ylabel('t-SNE Component 2')
    fig.colorbar(scatter, ax=ax, label='Cluster')
    ax.grid(True, alpha=0.3)
    plt.show()

    # Text summary for this perplexity, printed after the figure is shown
    cluster_count = np.unique(labels).size
    print(f"Perplexity {perplexity}: {cluster_count} кластеров, Silhouette = {sil_score:.3f}")