From 2126cdee3da354451c56ea618c59e7d07527714d Mon Sep 17 00:00:00 2001 From: SidoraDA Date: Sun, 30 Nov 2025 12:06:49 +0000 Subject: [PATCH] =?UTF-8?q?=D0=97=D0=B0=D0=B3=D1=80=D1=83=D0=B7=D0=B8?= =?UTF-8?q?=D0=BB(=D0=B0)=20=D1=84=D0=B0=D0=B9=D0=BB=D1=8B=20=D0=B2=20'lab?= =?UTF-8?q?works/LW4'?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- labworks/LW4/IS_LR4.ipynb | 399 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 399 insertions(+) create mode 100644 labworks/LW4/IS_LR4.ipynb diff --git a/labworks/LW4/IS_LR4.ipynb b/labworks/LW4/IS_LR4.ipynb new file mode 100644 index 0000000..29157ca --- /dev/null +++ b/labworks/LW4/IS_LR4.ipynb @@ -0,0 +1,399 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "T4" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7kDlfTCMN-n2" + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "device_name=tf.test.gpu_device_name()\n", + "if device_name!='/device:GPU:0':\n", + " raise SystemError ('GPU device not found')\n", + "print('Found GPU at: {}'.format(device_name))" + ] + }, + { + "cell_type": "code", + "source": [ + "#загрузка датасета\n", + "from keras.datasets import imdb\n", + "vocabulary_size=5000\n", + "index_from=3\n", + "(X_train,y_train),(X_test,y_test)=imdb.load_data(path=\"imdb.npz\",num_words=vocabulary_size,skip_top=0,maxlen=None,seed=15,start_char=1,oov_char=2,index_from=index_from)" + ], + "metadata": { + "id": "TJBFrj0mP_as" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Вывод размеров массивов данных\n", + "print(\"Размеры обучающих данных:\")\n", + "print(f\"X_train: {len(X_train)} \")\n", + "print(f\"y_train: {y_train.shape}\")\n", + "print(\"\\nРазмеры 
тестовых данных:\")\n", + "print(f\"X_test: {len(X_test)} \")\n", + "print(f\"y_test: {y_test.shape}\")" + ], + "metadata": { + "id": "wHyWnCbvWauD" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Пункт 3: Создание словаря для перевода индексов в слова\n", + "# Загрузка словаря \"слово:индекс\"\n", + "word_to_id = imdb.get_word_index()\n", + "\n", + "# Уточнение словаря\n", + "word_to_id = {key: (value + index_from) for key, value in word_to_id.items()}\n", + "word_to_id[\"<PAD>\"] = 0\n", + "word_to_id[\"<START>\"] = 1\n", + "word_to_id[\"<UNK>\"] = 2\n", + "word_to_id[\"<UNUSED>\"] = 3" + ], + "metadata": { + "id": "5A3EcfboWtHs" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Создание обратного словаря \"индекс:слово\"\n", + "id_to_word = {value: key for key, value in word_to_id.items()}\n", + "\n", + "# Вывод одного отзыва из обучающего множества\n", + "import random\n", + "sample_index = random.randint(0, len(X_train)-1)\n", + "print(f\"\\nОтзыв №{sample_index}\")\n", + "print(\"Список индексов слов:\")\n", + "print(X_train[sample_index])" + ], + "metadata": { + "id": "QFWpDHnzWvVI" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Преобразование в текст\n", + "review_as_text = ' '.join(id_to_word.get(id, '') for id in X_train[sample_index])\n", + "print(\"\\nОтзыв в виде текста:\")\n", + "print(review_as_text)\n", + "\n", + "# Длина отзыва и метка класса\n", + "print(f\"\\nДлина отзыва: {len(X_train[sample_index])} слов\")\n", + "print(f\"Метка класса: {y_train[sample_index]} ({'Positive' if y_train[sample_index] == 1 else 'Negative'})\")" + ], + "metadata": { + "id": "gcM-G1ZDWxye" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Пункт 4: Максимальная и минимальная длина отзыва в обучающем множестве\n", + "lengths = [len(review) for review in X_train]\n", + "max_length = 
max(lengths)\n", + "min_length = min(lengths)\n", + "print(f\"Максимальная длина отзыва: {max_length} слов\")\n", + "print(f\"Минимальная длина отзыва: {min_length} слов\")" + ], + "metadata": { + "id": "FvosOeEyW3gD" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Пункт 5: Предобработка данных\n", + "from tensorflow.keras.utils import pad_sequences\n", + "\n", + "max_words = 500 # Выбранная единая длина\n", + "\n", + "X_train = pad_sequences(\n", + " X_train,\n", + " maxlen=max_words,\n", + " value=0,\n", + " padding='pre',\n", + " truncating='post'\n", + ")" + ], + "metadata": { + "id": "O49mrPqEW6F6" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "X_test = pad_sequences(\n", + " X_test,\n", + " maxlen=max_words,\n", + " value=0,\n", + " padding='pre',\n", + " truncating='post'\n", + ")" + ], + "metadata": { + "id": "XxoGW1oNW9Gu" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Пункт 6: Повторение п. 4 после предобработки\n", + "print(f\"Длина всех отзывов: {X_train.shape[1]} слов\")\n" + ], + "metadata": { + "id": "Zj4jmsjjW_OY" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Пункт 7: Повторение п. 
3 после предобработки\n", + "print(\"Список индексов слов:\")\n", + "print(X_train[sample_index])\n" + ], + "metadata": { + "id": "3bg2bfazXDQV" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Преобразование в текст (игнорируем нулевые паддинги)\n", + "review_after_preprocessing = ' '.join(\n", + " id_to_word.get(id, '') for id in X_train[sample_index] if id != 0\n", + ")\n", + "print(\"\\nОтзыв в виде текста после предобработки:\")\n", + "print(review_after_preprocessing)\n", + "\n", + "print(f\"\\nДлина отзыва после предобработки: {len([id for id in X_train[sample_index] if id != 0])} значимых слов\")\n", + "print(f\"Общая длина с паддингом: {len(X_train[sample_index])}\")\n", + "\n", + "print(\"\\nВывод: После предобработки все отзывы приведены к единой длине 500 слов.\")\n", + "\n" + ], + "metadata": { + "id": "enEAfqWFXFYh" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Пункт 8: Вывод предобработанных массивов и их размерностей\n", + "print(\"Предобработанное обучающее множество X_train (первые 5 примеров):\")\n", + "print(X_train[:5])\n", + "print(\"\\nПредобработанное тестовое множество X_test (первые 5 примеров):\")\n", + "print(X_test[:5])\n", + "print(f\"Размерность X_train после предобработки: {X_train.shape}\")\n", + "print(f\"Размерность X_test после предобработки: {X_test.shape}\")\n", + "print(f\"Размерность y_train: {y_train.shape}\")\n", + "print(f\"Размерность y_test: {y_test.shape}\")" + ], + "metadata": { + "id": "XhHax3ytXNfn" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "\n", + "from tensorflow.keras.models import Sequential\n", + "from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense\n", + "import numpy as np\n", + "\n", + "model = Sequential()\n", + "\n", + "model.add(Embedding(\n", + " input_dim=vocabulary_size,\n", + " output_dim=32,\n", + " input_length=max_words\n", + 
"))\n", + "\n", + "\n", + "model.add(LSTM(units=100))\n", + "\n", + "\n", + "model.add(Dropout(rate=0.3))\n", + "\n", + "\n", + "model.add(Dense(1, activation='sigmoid')) #\n", + "\n", + "\n", + "model.build(input_shape=(None, max_words))\n", + "\n", + "\n", + "model.compile(\n", + " loss='binary_crossentropy',\n", + " optimizer='adam',\n", + " metrics=['accuracy']\n", + ")\n", + "\n", + "\n", + "print(\"Архитектура нейронной сети\")\n", + "model.summary()" + ], + "metadata": { + "id": "37SouWnhYOpH" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "history = model.fit(\n", + " X_train,\n", + " y_train,\n", + " validation_split=0.2,\n", + " batch_size=64,\n", + " epochs=5,\n", + " verbose=1\n", + ")" + ], + "metadata": { + "id": "RKFjXgRTZOxV" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Проверяем достигнутое качество на валидационных данных\n", + "val_accuracy = history.history['val_accuracy'][-1]\n", + "print(f\"\\nТочность на валидационных данных: {val_accuracy:.4f}\")\n", + "if val_accuracy >= 0.8:\n", + " print(\"Цель достигнута: accuracy >= 0.8\")\n", + "else:\n", + " print(\"Цель не достигнута: accuracy < 0.8\")" + ], + "metadata": { + "id": "FYcoID1UZafM" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Пункт 10: Оценка качества обучения на тестовых данных\n", + "\n", + "# 1) Значение метрики качества классификации на тестовых данных\n", + "print(\"\\n1) Метрика качества на тестовых данных:\")\n", + "test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)\n", + "print(f\" Loss: {test_loss:.4f}\")\n", + "print(f\" Accuracy: {test_accuracy:.4f}\")" + ], + "metadata": { + "id": "lvqn53FDZdj0" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# 2) Отчет о качестве классификации тестовой выборки\n", + "y_score = model.predict(X_test, verbose=0)\n", + 
"\n", + "# Преобразуем вероятности в бинарные предсказания (порог 0.5)\n", + "y_pred = [1 if y_score[i, 0] >= 0.5 else 0 for i in range(len(y_score))]\n", + "\n", + "from sklearn.metrics import classification_report\n", + "print(classification_report(y_test, y_pred, labels=[0,1], target_names=['Negative','Positive']))" + ], + "metadata": { + "id": "2GbHQzH2Zo_D" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# 3) Построение ROC-кривой и вычисление AUC-ROC\n", + "from sklearn.metrics import roc_curve, auc, roc_auc_score\n", + "import matplotlib.pyplot as plt\n", + "\n", + "fpr, tpr, thresholds = roc_curve(y_test, y_score)\n", + "\n", + "plt.figure(figsize=(8, 6))\n", + "plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {auc(fpr, tpr):.4f})')\n", + "plt.xlim([0.0, 1.0])\n", + "plt.ylim([0.0, 1.05])\n", + "plt.xlabel('False Positive Rate')\n", + "plt.ylabel('True Positive Rate')\n", + "plt.title('ROC Curve')\n", + "plt.legend(loc=\"lower right\")\n", + "plt.grid(True)\n", + "plt.show()" + ], + "metadata": { + "id": "7bclQvRoZu0F" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Вычисляем AUC-ROC\n", + "auc_roc = roc_auc_score(y_test, y_score)\n", + "print(f\" Площадь под ROC-кривой (AUC-ROC): {auc_roc:.4f}\")" + ], + "metadata": { + "id": "StkTG5k7ZxxD" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file