23:10 добавлена часть ЛР1

3 месяцев назад · 295c3ccb09
--- a/ИАД/lr1/task1_Fisher.py
+++ b/ИАД/lr1/task1_Fisher.py
@ -0,0 +1,65 @@
+import numpy as np
+from sklearn.datasets import load_iris
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sb
+import pingouin as pg
+from scipy import stats
+
+# Fisher's Iris dataset, feature columns only (the target column is not needed here).
+iris = load_iris()
+iris_pd = pd.DataFrame(data=iris['data'], columns=iris['feature_names'])
+
+# One sample per feature:
+#   x1 - sepal length, x2 - sepal width, x3 - petal length, x4 - petal width.
+x1, x2, x3, x4 = (
+    np.array(iris_pd[name])
+    for name in ('sepal length (cm)', 'sepal width (cm)',
+                 'petal length (cm)', 'petal width (cm)')
+)
+
+# Normality check with the Kolmogorov-Smirnov test.
+# H0: the standardized sample follows N(0, 1). A goodness-of-fit check has to
+# react to deviations in both directions, so the two-sided alternative is used.
+# NOTE(review): mean and std are estimated from the same sample, which makes
+# the plain K-S p-value conservative; a Lilliefors-type correction may be
+# more appropriate - confirm with the assignment requirements.
+print('Тест Колмогорова-Смирнова. Уровень значимости a = 0.05')
+
+for col in iris['feature_names']:
+    x = iris_pd[col]
+    x_mean, std = np.mean(x), np.std(x, ddof=1)
+    x_std = (x - x_mean) / std
+    pvalue = stats.kstest(x_std, "norm", alternative='two-sided').pvalue
+    if pvalue > 0.05:
+        result = 'H0 не должна быть отвергнута'
+    else:
+        result = 'H0 должна быть отвергнута'
+    print('Для {}: p-value: {}, так что {}'.format(col, pvalue, result))
+
+# One row of three panels: Tukey box plot, pairwise correlations, partial correlations.
+fig1, axes = plt.subplots(1, 3, figsize=(20, 4))
+
+
+def _corr_heatmap(matrix, panel, caption):
+    # Annotated heat map without tick labels, drawn on the given panel.
+    panel.set_title(caption)
+    sb.heatmap(matrix, annot=True, linewidths=0.5,
+               xticklabels=False, yticklabels=False, ax=panel)
+
+
+# Tukey box plot of the four feature samples side by side.
+feature_labels = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']
+axes[0].boxplot([x1, x2, x3, x4], tick_labels=feature_labels)
+axes[0].set_title('Диаграмма Тьюки')
+
+# Heat map of the pairwise correlation coefficients.
+corrMatrix = iris_pd.corr()
+_corr_heatmap(corrMatrix, axes[1], 'Тепловая карта парных коэфф. корреляции')
+
+# Heat map of the partial correlation coefficients
+# (DataFrame.pcorr is made available by the pingouin import).
+pCorrMatrix = iris_pd.pcorr()
+_corr_heatmap(pCorrMatrix, axes[2], 'Тепловая карта частных коэфф. корреляции')
+
+
+# Histogram of every feature sample on a 2x2 grid (row-major order: X1, X2, X3, X4).
+fig2, axes = plt.subplots(2, 2, figsize=(10, 7))
+
+labelled_samples = (('X1', x1), ('X2', x2), ('X3', x3), ('X4', x4))
+for panel, (label, sample) in zip(axes.flat, labelled_samples):
+    panel.hist(sample)
+    panel.set_title('Гисотграмма по значениям ' + label)
+
+plt.show()
--- a/ИАД/lr1/task1_HW.py
+++ b/ИАД/lr1/task1_HW.py
@ -0,0 +1,73 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sb
+import pingouin as pg
+from scipy import stats
+
+# "Height-Weight-Age-Position" dataset: the first 115 rows of the tab-separated file.
+columns = ['Position', 'Height(inches)', 'Weight(pounds)', 'Age']
+
+df = pd.read_csv(
+    'SOCR_Data_MLB_HeightsWeights.txt',
+    sep='\t',
+    usecols=columns,
+    nrows=115,
+)
+
+# Disabled: numeric encoding of the 'Position' labels.
+# NOTE(review): the code further down treats 'Position' as numeric (box plot,
+# correlations, K-S test); if the file stores text labels this mapping has to
+# be re-enabled - confirm against the data file.
+#replace_dict = {'Catcher':1, 'First_Baseman':2, 'Second_Baseman':3, 'Shortstop':4, 'Third_Baseman':5,
+# 'Outfielder':6, 'Designated_Hitter':7, 'Starting_Pitcher':8, 'Relief_Pitcher':9}
+
+#pd.set_option('future.no_silent_downcasting', True)
+#df['Position'] = df['Position'].replace(replace_dict)
+# Columns present in the file:
+#Index(['Name', 'Team', 'Position', 'Height(inches)', 'Weight(pounds)', 'Age'], dtype='object')
+
+
+
+# Tukey box plots, one panel per column on a 2x2 grid.
+fig1, axes = plt.subplots(2, 2, figsize=(10, 10))
+fig1.suptitle('Диаграммы Тюки по признакам')
+
+# Panels are filled row by row in the order of `columns`; each box is labelled
+# with its column name.
+# NOTE(review): assumes df['Position'] holds numeric values - confirm.
+for panel, col in zip(axes.flat, columns):
+    panel.boxplot(df[col], tick_labels=[col])
+
+# Two side-by-side heat maps: pairwise and partial correlation coefficients.
+fig2, axes = plt.subplots(1, 2, figsize=(12, 5))
+fig2.suptitle('Тепловые карты')
+
+
+def _corr_heatmap(matrix, panel, caption):
+    # Annotated heat map without tick labels, drawn on the given panel.
+    panel.set_title(caption)
+    sb.heatmap(matrix, annot=True, linewidths=0.5,
+               xticklabels=False, yticklabels=False, ax=panel)
+
+
+# Pairwise correlation coefficients.
+# NOTE(review): DataFrame.corr needs every selected column to be numeric,
+# including 'Position' - confirm.
+corrMatrix = df.corr()
+_corr_heatmap(corrMatrix, axes[0], 'Тепловая карта парных коэфф. корреляции')
+
+# Partial correlation coefficients (DataFrame.pcorr comes with the pingouin import).
+pCorrMatrix = df.pcorr()
+_corr_heatmap(pCorrMatrix, axes[1], 'Тепловая карта частных коэфф. корреляции')
+
+
+
+# Normality check with the Kolmogorov-Smirnov test for every column.
+# H0: the standardized sample follows N(0, 1). A goodness-of-fit check has to
+# react to deviations in both directions, so the two-sided alternative is used.
+# NOTE(review): mean and std are estimated from the same sample, which makes
+# the plain K-S p-value conservative - confirm whether a correction is expected.
+print('Тест Колмогорова-Смирнова. Уровень значимости a = 0.05')
+
+for col in columns:
+    x = df[col]
+    x_mean, std = np.mean(x), np.std(x, ddof=1)
+    x_std = (x - x_mean) / std
+    pvalue = stats.kstest(x_std, "norm", alternative='two-sided').pvalue
+    if pvalue > 0.05:
+        result = 'H0 не должна быть отвергнута'
+    else:
+        result = 'H0 должна быть отвергнута'
+    print('Для {}: p-value: {}, так что {}'.format(col, pvalue, result))
+    
+# Histogram of every column on a 2x2 grid, filled row by row in the order of `columns`.
+fig2, axes = plt.subplots(2, 2, figsize=(10, 7))
+
+for panel, col in zip(axes.flat, columns):
+    panel.hist(df[col])
+    panel.set_title('Гисотграмма по значениям ' + col)
+
+plt.show()
--- a/ИАД/lr1/task2_Fisher.py
+++ b/ИАД/lr1/task2_Fisher.py
@ -0,0 +1,102 @@
+import numpy as np
+from sklearn.datasets import load_iris
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import r2_score
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sb
+import pingouin as pg
+from scipy import stats
+
+# Fisher's Iris dataset: the four features plus the class label as a 'target' column.
+iris = load_iris()
+iris_pd = pd.DataFrame(
+    data=np.column_stack((iris['data'], iris['target'])),
+    columns=[*iris['feature_names'], 'target'],
+)
+
+# One sample per feature:
+#   x1 - sepal length, x2 - sepal width, x3 - petal length, x4 - petal width.
+x1, x2, x3, x4 = (
+    np.array(iris_pd[name])
+    for name in ('sepal length (cm)', 'sepal width (cm)',
+                 'petal length (cm)', 'petal width (cm)')
+)
+
+# Pairwise regression: X4 is the response and X3 the predictor, chosen because
+# this pair is more strongly correlated than any other combination.
+fig, axes = plt.subplots(2, 2, figsize=(10, 10))
+
+axes[0,0].scatter(x3, x4)
+axes[0,0].set_title('Зависимость X4 от X3')
+
+axes[0,1].scatter(x3, x4)
+axes[0,1].set_title('Предсказанные значения')
+
+N = len(x4)   # number of observations
+K = 2         # number of estimated parameters (slope + intercept)
+lr = LinearRegression().fit(x3.reshape(-1, 1), x4)
+
+# Fitted values at the observed points; metrics and residuals are based on these.
+line_y1 = lr.predict(x3.reshape(-1, 1))
+axes[0,0].plot(x3, line_y1, color='red', linewidth=2)
+
+# Fitted line extended up to twice the largest observed X3.
+line_x3_pred = np.linspace(min(x3), 2*max(x3), 150)
+line_y1_pred = lr.predict(line_x3_pred.reshape(-1, 1))
+axes[0,1].plot(line_x3_pred, line_y1_pred, color='red', linewidth=2)
+
+# Sums of squares (regression / total / residual) and their mean squares.
+meanY = np.mean(x4)
+Qreg = np.sum((line_y1 - meanY)**2)
+Qtotal = np.sum((x4 - meanY)**2)
+Qint = np.sum((x4 - line_y1)**2)
+Sreg = Qreg / (K - 1)
+Stotal = Qtotal / (N - 1)
+Sint = Qint / (N - K)
+
+# Sanity check: least-squares residuals are orthogonal to the fitted values,
+# so this dot product should be close to zero.
+print(np.dot(x4 - line_y1, line_y1))
+
+r2_pair = r2_score(x4, line_y1)
+print('Парная регрессия. Коэффициент детерминации: ', r2_pair)
+# Adjusted R^2 = 1 - (1 - R^2) * (N - 1) / (N - K)
+print('Скорр. коэффициент детерминации: ', 1 - (1 - r2_pair) * (N - 1) / (N - K))
+print('Множ. коэффициент корреляции: ', np.sqrt(r2_pair))
+# Standard error of the regression = sqrt(residual mean square), N - K degrees of freedom.
+print('Стандартная ошибка регрессии: ', np.sqrt(Sint))
+
+# Multiple regression: X4 on X1, X2 and X3.
+N = len(x4)   # number of observations
+K = 4         # number of estimated parameters (3 slopes + intercept)
+X = np.vstack((x1, x2, x3)).T
+mr = LinearRegression().fit(X, x4)
+
+# Fitted values at the observed points. The goodness-of-fit metrics and the
+# residuals must be computed from these, not from predictions on an
+# artificial grid that does not correspond to the observed x4 values.
+line_y2 = mr.predict(X)
+
+# Illustration only: model response along the straight path that runs from the
+# minimum to the maximum of every predictor, drawn against X3.
+line_x1 = np.linspace(min(x1), max(x1), 150)
+line_x2 = np.linspace(min(x2), max(x2), 150)
+line_x3 = np.linspace(min(x3), max(x3), 150)
+line_y2_path = mr.predict(np.column_stack((line_x1, line_x2, line_x3)))
+axes[0,0].plot(line_x3, line_y2_path, color='green', linewidth=2)
+
+# The same path extended up to twice the largest observed values.
+line_x1_pred = np.linspace(min(x1), 2*max(x1), 150)
+line_x2_pred = np.linspace(min(x2), 2*max(x2), 150)
+line_x3_pred = np.linspace(min(x3), 2*max(x3), 150)
+line_y2_pred = mr.predict(np.column_stack((line_x1_pred, line_x2_pred, line_x3_pred)))
+axes[0,1].plot(line_x3_pred, line_y2_pred, color='green', linewidth=2)
+
+# Sums of squares and mean squares (meanY and Qtotal come from the pairwise section).
+Qreg = np.sum((line_y2 - meanY)**2)
+Qint = np.sum((x4 - line_y2)**2)
+Sreg = Qreg / (K - 1)
+Stotal = Qtotal / (N - 1)
+Sint = Qint / (N - K)
+
+r2_mult = r2_score(x4, line_y2)
+print('Множ. регресиия. Коэффициент детерминации: ', r2_mult)
+# Adjusted R^2 = 1 - (1 - R^2) * (N - 1) / (N - K)
+print('Скорр. коэффициент детерминации: ', 1 - (1 - r2_mult) * (N - 1) / (N - K))
+print('Множ. коэффициент корреляции: ', np.sqrt(r2_mult))
+# Standard error of the regression = sqrt(residual mean square), N - K degrees of freedom.
+print('Стандартная ошибка регрессии: ', np.sqrt(Sint))
+
+# Residuals of both models plotted against the observation number (1..150).
+residuals1 = x4 - line_y1
+residuals2 = x4 - line_y2
+
+obs_numbers = list(range(1, 151))
+for panel, resid, caption in (
+    (axes[1,0], residuals1, 'Парная регрессия. График остатков'),
+    (axes[1,1], residuals2, 'Множ. регрессия. График остатков'),
+):
+    panel.scatter(obs_numbers, resid)
+    panel.set_title(caption)
+
+plt.show()
+
--- a/ИАД/lr1/task2_HW.py
+++ b/ИАД/lr1/task2_HW.py
@ -0,0 +1,95 @@
+import numpy as np
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import r2_score
+import pandas as pd
+import matplotlib.pyplot as plt
+from scipy import stats
+from mpl_toolkits.mplot3d import Axes3D
+
+# "Height-Weight-Age-Position" dataset: the first 115 rows of the tab-separated file.
+columns = ['Position', 'Height(inches)', 'Weight(pounds)', 'Age']
+
+df = pd.read_csv(
+    'SOCR_Data_MLB_HeightsWeights.txt',
+    sep='\t',
+    usecols=columns,
+    nrows=115,
+)
+
+# Pairwise regression: Weight(pounds) is the response and Height(inches) the
+# predictor, chosen because this pair is more strongly correlated than any other.
+height = df['Height(inches)']
+weight = df['Weight(pounds)']
+height_2d = height.values.reshape(-1, 1)   # the estimator takes a 2-D feature matrix
+
+fig1, axes = plt.subplots(1, 3, figsize=(15, 5))
+fig1.suptitle('Парная регрессия')
+
+axes[0].scatter(height, weight)
+axes[0].set_xlabel('Height(inches)')
+axes[0].set_ylabel('Weight(pounds)')
+axes[0].set_title('График регрессии')
+
+N = len(df)   # number of observations
+lr = LinearRegression().fit(height_2d, weight)
+
+# Fitted values at the observed heights.
+line_weight1 = lr.predict(height_2d)
+axes[0].plot(height, line_weight1, color='red', linewidth=2)
+
+# Sums of squares: regression, residual and total.
+Qreg = np.sum((line_weight1 - weight.mean())**2)
+Qres = np.sum((weight - line_weight1)**2)
+Qtotal = np.sum((weight - weight.mean())**2)
+
+res = weight - line_weight1
+
+# Residuals against fitted values, with a zero reference line.
+axes[1].scatter(line_weight1, res)
+axes[1].axhline(y=0, color='red', linestyle='--')
+axes[1].set_xlabel('Предсказанные значения')
+axes[1].set_ylabel('Остаточные ошибки')
+axes[1].set_title('Анализ остатков')
+
+# Distribution of the residuals.
+axes[2].hist(res)
+axes[2].set_xlabel('Остаточные ошибки')
+axes[2].set_ylabel('Количество точек')
+axes[2].set_title('Гистограмма остатков')
+
+r2_pair = r2_score(weight, line_weight1)
+print('Парная регрессия. Коэффициент детерминации: ', r2_pair)
+# Adjusted R^2 = 1 - (1 - R^2) * (N - 1) / (N - 2); two parameters are estimated.
+print('Скорр. коэффициент детерминации: ', 1 - (1 - r2_pair) * (N - 1) / (N - 2))
+print('Множ. коэффициент корреляции: ', np.sqrt(r2_pair))
+# Standard error of the regression with N - 2 degrees of freedom.
+print('Стандартная ошибка регрессии: ', np.sqrt(Qres / (N - 2)))
+
+# Multiple regression: Weight(pounds) on Position, Height(inches) and Age.
+# NOTE(review): assumes df['Position'] holds numeric values - confirm.
+K = 4               # number of estimated parameters (3 slopes + intercept)
+n_obs = len(df)     # number of observations
+X = np.vstack((df['Position'], df['Height(inches)'], df['Age'])).T
+mr = LinearRegression().fit(X, df['Weight(pounds)'])
+line_weight2 = mr.predict(X)   # fitted values at the observed points
+
+fig2, axes = plt.subplots(1, 2, figsize=(10, 5))
+fig2.suptitle('Множественная регрессия')
+
+# Sums of squares: regression and residual.
+Qreg = np.sum((line_weight2 - df['Weight(pounds)'].mean())**2)
+Qres = np.sum((df['Weight(pounds)'] - line_weight2)**2)
+
+res = df['Weight(pounds)'] - line_weight2
+
+# Residuals against fitted values, with a zero reference line.
+axes[0].scatter(line_weight2, res)
+axes[0].axhline(y=0, color='red', linestyle='--')
+axes[0].set_xlabel('Предсказанные значения')
+axes[0].set_ylabel('Остаточные ошибки')
+axes[0].set_title('Анализ остатков')
+
+# Distribution of the residuals.
+axes[1].hist(res)
+axes[1].set_xlabel('Остаточные ошибки')
+axes[1].set_ylabel('Количество точек')
+axes[1].set_title('Гистограмма остатков')
+
+r2_mult = r2_score(df['Weight(pounds)'], line_weight2)
+print('Множ. регрессия. Коэффициент детерминации: ', r2_mult)
+# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - K)
+print('Скорр. коэффициент детерминации: ', 1 - (1 - r2_mult) * (n_obs - 1) / (n_obs - K))
+print('Множ. коэффициент корреляции: ', np.sqrt(r2_mult))
+# Standard error of the regression with n - K degrees of freedom.
+print('Стандартная ошибка регрессии: ', np.sqrt(Qres / (n_obs - K)))
+
+plt.show()
+
+
--- a/ИАД/lr1/task3_Fisher.py
+++ b/ИАД/lr1/task3_Fisher.py
@ -0,0 +1,105 @@
+import numpy as np
+from sklearn.datasets import load_iris
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import r2_score
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sb
+import pingouin as pg
+from scipy import stats
+
+# Fisher's Iris dataset: the four features plus the class label as a 'target' column.
+iris = load_iris()
+iris_pd = pd.DataFrame(
+    data=np.column_stack((iris['data'], iris['target'])),
+    columns=[*iris['feature_names'], 'target'],
+)
+
+# One sample per feature:
+#   x1 - sepal length, x2 - sepal width, x3 - petal length, x4 - petal width.
+x1, x2, x3, x4 = (
+    np.array(iris_pd[name])
+    for name in ('sepal length (cm)', 'sepal width (cm)',
+                 'petal length (cm)', 'petal width (cm)')
+)
+
+# Pairwise regression: X4 is the response and X3 the predictor, chosen because
+# this pair is more strongly correlated than any other combination.
+fig, axes = plt.subplots(2, 2, figsize=(10, 10))
+
+N = len(x4)   # number of observations
+K = 2         # number of estimated parameters (slope + intercept)
+lr = LinearRegression().fit(x3.reshape(-1, 1), x4)
+
+# Fitted values at the observed X3 points. The residual analysis below pairs
+# them element-wise with x4, so they must not come from an evenly spaced grid.
+line_y1 = lr.predict(x3.reshape(-1, 1))
+
+# Sums of squares (regression / total / residual) and their mean squares.
+meanY = np.mean(x4)
+Qreg = np.sum((line_y1 - meanY)**2)
+Qtotal = np.sum((x4 - meanY)**2)
+Qint = np.sum((x4 - line_y1)**2)
+Sreg = Qreg / (K - 1)
+Stotal = Qtotal / (N - 1)
+Sint = Qint / (N - K)
+
+
+# Multiple regression: X4 on X1, X2 and X3.
+N = len(x4)   # number of observations
+K = 4         # number of estimated parameters (3 slopes + intercept)
+X = np.vstack((x1, x2, x3)).T
+mr = LinearRegression().fit(X, x4)
+
+# Fitted values at the observed points. The residual analysis below pairs
+# them element-wise with x4, so they must not come from an evenly spaced grid.
+line_y2 = mr.predict(X)
+
+# Sums of squares and mean squares (meanY and Qtotal come from the pairwise section).
+Qreg = np.sum((line_y2 - meanY)**2)
+Qint = np.sum((x4 - line_y2)**2)
+Sreg = Qreg / (K - 1)
+Stotal = Qtotal / (N - 1)
+Sint = Qint / (N - K)
+
+
+# Residuals of both models and their histograms on the top row of the figure.
+residuals1 = x4 - line_y1
+residuals2 = x4 - line_y2
+
+for panel, resid in zip(axes[0], (residuals1, residuals2)):
+    panel.hist(resid)
+
+plt.show()
+
+# Normality check of the residuals with the Kolmogorov-Smirnov test.
+# H0: the standardized residuals follow N(0, 1). A goodness-of-fit check has to
+# react to deviations in both directions, so the two-sided alternative is used.
+# NOTE(review): mean and std are estimated from the same sample, which makes
+# the plain K-S p-value conservative - confirm whether a correction is expected.
+print('Тест Колмогорова-Смирнова. Уровень значимости a = 0.05')
+
+for x in [residuals1, residuals2]:
+    x_mean, std = np.mean(x), np.std(x, ddof=1)
+    x_std = (x - x_mean) / std
+    pvalue = stats.kstest(x_std, "norm", alternative='two-sided').pvalue
+    if pvalue > 0.05:
+        result = 'H0 не должна быть отвергнута'
+    else:
+        result = 'H0 должна быть отвергнута'
+    print('p-value: {}, так что {}'.format(pvalue, result))
+    
+# Durbin-Watson statistic: the sum of squared differences between successive
+# residuals divided by the sum of squared residuals. It is computed from the
+# residual vector itself, so it does not depend on a separately kept length.
+# gl / gu are printed next to the statistic as the lower / upper critical bounds.
+# NOTE(review): the bounds are presumably read from the Durbin-Watson table for
+# this sample size and number of regressors - confirm against the table.
+
+# pairwise regression
+gl = 1.706
+gu = 1.760
+gamma = np.sum(np.diff(residuals1)**2) / np.sum(residuals1**2)
+print(gamma, gl, gu)
+
+# multiple regression
+gl = 1.679
+gu = 1.788
+gamma = np.sum(np.diff(residuals2)**2) / np.sum(residuals2**2)
+print(gamma, gl, gu)
--- a/ИАД/lr1/task5_Fisher.py
+++ b/ИАД/lr1/task5_Fisher.py
@ -0,0 +1,45 @@
+import numpy as np
+from sklearn.datasets import load_iris
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.model_selection import train_test_split
+import pandas as pd
+import matplotlib.pyplot as plt
+
+# Fisher's Iris dataset: the four features plus the class label as a 'target' column.
+iris = load_iris()
+iris_pd = pd.DataFrame(
+    data=np.column_stack((iris['data'], iris['target'])),
+    columns=[*iris['feature_names'], 'target'],
+)
+
+# One sample per feature:
+#   x1 - sepal length, x2 - sepal width, x3 - petal length, x4 - petal width.
+x1, x2, x3, x4 = (
+    np.array(iris_pd[name])
+    for name in ('sepal length (cm)', 'sepal width (cm)',
+                 'petal length (cm)', 'petal width (cm)')
+)
+
+# Predictors X1..X3 as columns of one matrix; X4 is the response.
+X = np.column_stack((x1, x2, x3))
+
+# Hold out 25% of the observations for testing (fixed seed for reproducibility).
+X_train, X_test, y_train, y_test = train_test_split(
+    X, x4, test_size=0.25, random_state=42
+)
+
+# Fit on the training part, predict the held-out part.
+model = LinearRegression()
+model.fit(X_train, y_train)
+y_pred = model.predict(X_test)
+
+# Quality on the test part: RMSE and coefficient of determination.
+r2 = r2_score(y_test, y_pred)
+mse = mean_squared_error(y_test, y_pred)
+rmse = np.sqrt(mse)
+
+print(f"Среднеквадратичное отклонение (СКO): {rmse:.2f}")
+print(f"Коэффициент детерминации (R²): {r2:.2f}")
+
+# Residuals against predicted values, with a zero reference line.
+residuals = y_test - y_pred
+
+ax = plt.gca()
+ax.scatter(y_pred, residuals)
+ax.axhline(y=0, color='red', linestyle='--')
+ax.set_xlabel('Предсказанные значения')
+ax.set_ylabel('Остаточные ошибки')
+ax.set_title('График остатков')
+plt.show()
--- a/ИАД/lr1/Крит_знач_ДурбинВатсон.jpg
+++ b/ИАД/lr1/Крит_знач_ДурбинВатсон.jpg