# Labs/ИАД/lr1/task2_HW.py

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from mpl_toolkits.mplot3d import Axes3D

# "Height-Weight-Age-Position" dataset: only these four columns are loaded.
columns = ['Position', 'Height(inches)', 'Weight(pounds)', 'Age']

# Tab-separated file; the analysis is restricted to the first 115 data rows.
df = pd.read_csv(
    'SOCR_Data_MLB_HeightsWeights.txt',
    sep='\t',
    usecols=columns,
    nrows=115,
)

# Pairwise regression. Weight(pounds) is the response and Height(inches) the
# predictor; per the original author's note this pair correlates more strongly
# than the other column combinations.
fig1, axes = plt.subplots(1, 3, figsize=(15, 5))
fig1.suptitle('Парная регрессия')

height = df['Height(inches)']
weight = df['Weight(pounds)']
# scikit-learn expects a 2-D feature matrix, hence the single-column reshape.
height_2d = height.values.reshape(-1, 1)

# Panel 1: raw observations (the fitted line is overlaid after the fit).
axes[0].scatter(height, weight)
axes[0].set_xlabel('Height(inches)')
axes[0].set_ylabel('Weight(pounds)')
axes[0].set_title('График регрессии')

# Sample size is taken from the data instead of being hard-coded, so the
# degrees of freedom below stay correct if the number of loaded rows changes.
N = len(df)
n_predictors = 1
dof_res = N - n_predictors - 1  # residual degrees of freedom

lr = LinearRegression().fit(height_2d, weight)
line_weight1 = lr.predict(height_2d)
axes[0].plot(height, line_weight1, color='red', linewidth=2)

# Sum-of-squares decomposition: regression, residual and total.
Qreg = np.sum((line_weight1 - weight.mean())**2)
Qres = np.sum((weight - line_weight1)**2)
Qtotal = np.sum((weight - weight.mean())**2)

res = weight - line_weight1

# Panel 2: residuals against fitted values, with a zero reference line.
axes[1].scatter(line_weight1, res)
axes[1].axhline(y=0, color='red', linestyle='--')
axes[1].set_xlabel('Предсказанные значения')
axes[1].set_ylabel('Остаточные ошибки')
axes[1].set_title('Анализ остатков')

# Panel 3: distribution of the residuals.
axes[2].hist(res)
axes[2].set_xlabel('Остаточные ошибки')
axes[2].set_ylabel('Количество точек')
axes[2].set_title('Гистограмма остатков')

r2 = r2_score(weight, line_weight1)
# Adjusted R^2 = 1 - (1 - R^2) * (N - 1) / (N - k - 1).  The previous expression
# was mis-parenthesised and reduced to R^2 * 115 / 113, which exceeds R^2.
r2_adj = 1 - (1 - r2) * (N - 1) / dof_res

print('Парная регрессия. Коэффициент детерминации: ', r2)
print('Скорр. коэффициент детерминации: ', r2_adj)
print('Множ. коэффициент корреляции: ', np.sqrt(r2))
print('Стандартная ошибка регрессии: ', np.sqrt(Qres/dof_res))

# Multiple regression: Weight(pounds) explained by Position, Height(inches) and Age.
# NOTE(review): Position is assumed to hold text labels (confirm against the
# data file). get_dummies one-hot encodes only non-numeric columns and passes
# numeric ones through unchanged, so a numeric Position column still works.
# drop_first avoids a dummy set that is collinear with the intercept.
features = pd.get_dummies(
    df[['Position', 'Height(inches)', 'Age']],
    drop_first=True,
    dtype=float,
)
X = features.to_numpy(dtype=float)
target = df['Weight(pounds)']

n_obs = len(target)
K = X.shape[1] + 1    # estimated parameters: one slope per column + intercept
dof_res = n_obs - K   # residual degrees of freedom

mr = LinearRegression().fit(X, target)
line_weight2 = mr.predict(X)

fig2, axes = plt.subplots(1, 2, figsize=(10, 5))
fig2.suptitle('Множественная регрессия')

# Regression and residual sums of squares for the multiple model.
Qreg = np.sum((line_weight2 - target.mean())**2)
Qres = np.sum((target - line_weight2)**2)

res = target - line_weight2

# Panel 1: residuals against fitted values, with a zero reference line.
axes[0].scatter(line_weight2, res)
axes[0].axhline(y=0, color='red', linestyle='--')
axes[0].set_xlabel('Предсказанные значения')
axes[0].set_ylabel('Остаточные ошибки')
axes[0].set_title('Анализ остатков')

# Panel 2: distribution of the residuals.
axes[1].hist(res)
axes[1].set_xlabel('Остаточные ошибки')
axes[1].set_ylabel('Количество точек')
axes[1].set_title('Гистограмма остатков')

r2 = r2_score(target, line_weight2)
# Adjusted R^2 = 1 - (1 - R^2) * (N - 1) / (N - K).  The previous expression was
# mis-parenthesised and also reused the pairwise model's 113 degrees of freedom.
r2_adj = 1 - (1 - r2) * (n_obs - 1) / dof_res

# The first label previously said "Парная регрессия" for this multiple model.
print('Множественная регрессия. Коэффициент детерминации: ', r2)
print('Скорр. коэффициент детерминации: ', r2_adj)
print('Множ. коэффициент корреляции: ', np.sqrt(r2))
print('Стандартная ошибка регрессии: ', np.sqrt(Qres/dof_res))

# Display both figures (pairwise and multiple regression).
plt.show()