{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "7kDlfTCMN-n2"
      },
      "outputs": [],
      "source": [
        "# All imports for the notebook live in this cell so that a fresh kernel\n",
        "# can execute the cells strictly top to bottom.\n",
        "import random\n",
        "\n",
        "import matplotlib.pyplot as plt\n",
        "import numpy as np\n",
        "import tensorflow as tf\n",
        "from keras.datasets import imdb\n",
        "from sklearn.metrics import auc, classification_report, roc_auc_score, roc_curve\n",
        "from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM\n",
        "from tensorflow.keras.models import Sequential\n",
        "from tensorflow.keras.utils import pad_sequences\n",
        "\n",
        "# Abort early when the runtime has no GPU attached.\n",
        "device_name = tf.test.gpu_device_name()\n",
        "if device_name != '/device:GPU:0':\n",
        "    raise SystemError('GPUdevicenotfound')\n",
        "print('FoundGPUat:{}'.format(device_name))"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Load the IMDB review dataset.\n",
        "vocabulary_size = 5000  # passed to load_data as num_words\n",
        "index_from = 3          # offset applied to every real word index\n",
        "SEED = 15\n",
        "\n",
        "# Seed the Python, NumPy and TensorFlow generators so that the sampled\n",
        "# review index and the initial weights do not change between runs.\n",
        "tf.keras.utils.set_random_seed(SEED)\n",
        "\n",
        "(X_train, y_train), (X_test, y_test) = imdb.load_data(\n",
        "    path=\"imdb.npz\",\n",
        "    num_words=vocabulary_size,\n",
        "    skip_top=0,\n",
        "    maxlen=None,\n",
        "    seed=SEED,\n",
        "    start_char=1,\n",
        "    oov_char=2,\n",
        "    index_from=index_from,\n",
        ")"
      ],
      "metadata": {
        "id": "TJBFrj0mP_as"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Print the sizes of the data arrays.\n",
        "print(\"Размеры обучающих данных:\")\n",
        "print(f\"X_train: {len(X_train)} \")\n",
        "print(f\"y_train: {y_train.shape}\")\n",
        "print(\"\\nРазмеры тестовых данных:\")\n",
        "print(f\"X_test: {len(X_test)} \")\n",
        "print(f\"y_test: {y_test.shape}\")"
      ],
      "metadata": {
        "id": "wHyWnCbvWauD"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Step 3: build the dictionaries used to turn indices back into words.\n",
        "# Load the \"word -> index\" dictionary.\n",
        "word_to_id = imdb.get_word_index()\n",
        "\n",
        "# Shift every index by index_from so it matches the encoding requested from\n",
        "# load_data, then register the reserved ids. Each reserved id needs its own\n",
        "# key: with a shared key the assignments overwrite one another and ids 0-2\n",
        "# never reach the inverse dictionary.\n",
        "word_to_id = {word: (idx + index_from) for word, idx in word_to_id.items()}\n",
        "word_to_id[\"<PAD>\"] = 0     # padding value used by pad_sequences below\n",
        "word_to_id[\"<START>\"] = 1   # start_char passed to load_data\n",
        "word_to_id[\"<UNK>\"] = 2     # oov_char passed to load_data\n",
        "word_to_id[\"<UNUSED>\"] = 3"
      ],
      "metadata": {
        "id": "5A3EcfboWtHs"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Build the inverse \"index -> word\" dictionary.\n",
        "id_to_word = {idx: word for word, idx in word_to_id.items()}\n",
        "\n",
        "# Show one review picked at random from the training set.\n",
        "sample_index = random.randint(0, len(X_train) - 1)\n",
        "print(f\"\\nОтзыв №{sample_index}\")\n",
        "print(\"Список индексов слов:\")\n",
        "print(X_train[sample_index])"
      ],
      "metadata": {
        "id": "QFWpDHnzWvVI"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Decode the review into text.\n",
        "review_as_text = ' '.join(\n",
        "    id_to_word.get(word_id, '') for word_id in X_train[sample_index]\n",
        ")\n",
        "print(\"\\nОтзыв в виде текста:\")\n",
        "print(review_as_text)\n",
        "\n",
        "# Review length and class label.\n",
        "print(f\"\\nДлина отзыва: {len(X_train[sample_index])} слов\")\n",
        "print(f\"Метка класса: {y_train[sample_index]} ({'Positive' if y_train[sample_index] == 1 else 'Negative'})\")"
      ],
      "metadata": {
        "id": "gcM-G1ZDWxye"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Step 4: longest and shortest review in the training set.\n",
        "lengths = [len(review) for review in X_train]\n",
        "max_length = max(lengths)\n",
        "min_length = min(lengths)\n",
        "print(f\"Максимальная длина отзыва: {max_length} слов\")\n",
        "print(f\"Минимальная длина отзыва: {min_length} слов\")"
      ],
      "metadata": {
        "id": "FvosOeEyW3gD"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Step 5: preprocessing - bring every review to one common length.\n",
        "max_words = 500  # chosen common length\n",
        "\n",
        "# Short reviews get zeros in front; long reviews lose their tail.\n",
        "X_train = pad_sequences(\n",
        "    X_train,\n",
        "    maxlen=max_words,\n",
        "    value=0,\n",
        "    padding='pre',\n",
        "    truncating='post'\n",
        ")"
      ],
      "metadata": {
        "id": "O49mrPqEW6F6"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Apply the same padding and truncation settings to the test set.\n",
        "X_test = pad_sequences(\n",
        "    X_test,\n",
        "    maxlen=max_words,\n",
        "    value=0,\n",
        "    padding='pre',\n",
        "    truncating='post'\n",
        ")"
      ],
      "metadata": {
        "id": "XxoGW1oNW9Gu"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Step 6: repeat step 4 after preprocessing.\n",
        "print(f\"Длина всех отзывов: {X_train.shape[1]} слов\")\n"
      ],
      "metadata": {
        "id": "Zj4jmsjjW_OY"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Step 7: repeat step 3 after preprocessing.\n",
        "print(\"Список индексов слов:\")\n",
        "print(X_train[sample_index])\n"
      ],
      "metadata": {
        "id": "3bg2bfazXDQV"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Decode the padded review, skipping the zero padding entries.\n",
        "review_after_preprocessing = ' '.join(\n",
        "    id_to_word.get(word_id, '') for word_id in X_train[sample_index] if word_id != 0\n",
        ")\n",
        "print(\"\\nОтзыв в виде текста после предобработки:\")\n",
        "print(review_after_preprocessing)\n",
        "\n",
        "# Count the entries that are not padding.\n",
        "significant_words = np.count_nonzero(X_train[sample_index])\n",
        "print(f\"\\nДлина отзыва после предобработки: {significant_words} значимых слов\")\n",
        "print(f\"Общая длина с паддингом: {len(X_train[sample_index])}\")\n",
        "\n",
        "# Report the length from max_words so the message follows the constant.\n",
        "print(f\"\\nВывод: После предобработки все отзывы приведены к единой длине {max_words} слов.\")\n",
        "\n"
      ],
      "metadata": {
        "id": "enEAfqWFXFYh"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Step 8: show the preprocessed arrays and their shapes.\n",
        "print(\"Предобработанное обучающее множество X_train (первые 5 примеров):\")\n",
        "print(X_train[:5])\n",
        "print(\"\\nПредобработанное тестовое множество X_test (первые 5 примеров):\")\n",
        "print(X_test[:5])\n",
        "print(f\"Размерность X_train после предобработки: {X_train.shape}\")\n",
        "print(f\"Размерность X_test после предобработки: {X_test.shape}\")\n",
        "print(f\"Размерность y_train: {y_train.shape}\")\n",
        "print(f\"Размерность y_test: {y_test.shape}\")"
      ],
      "metadata": {
        "id": "XhHax3ytXNfn"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Binary sentiment classifier: Embedding -> LSTM -> Dropout -> Dense.\n",
        "model = Sequential()\n",
        "\n",
        "# The sequence length is supplied through model.build() below, so the\n",
        "# deprecated input_length argument is not passed to the layer.\n",
        "model.add(Embedding(\n",
        "    input_dim=vocabulary_size,\n",
        "    output_dim=32\n",
        "))\n",
        "\n",
        "model.add(LSTM(units=100))\n",
        "\n",
        "model.add(Dropout(rate=0.3))\n",
        "\n",
        "# A single sigmoid unit gives the score thresholded at 0.5 later on.\n",
        "model.add(Dense(1, activation='sigmoid'))\n",
        "\n",
        "# Build with an explicit input shape so summary() can list the layers.\n",
        "model.build(input_shape=(None, max_words))\n",
        "\n",
        "model.compile(\n",
        "    loss='binary_crossentropy',\n",
        "    optimizer='adam',\n",
        "    metrics=['accuracy']\n",
        ")\n",
        "\n",
        "print(\"Архитектура нейронной сети\")\n",
        "model.summary()"
      ],
      "metadata": {
        "id": "37SouWnhYOpH"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Train the model, holding out 20% of the training data for validation.\n",
        "history = model.fit(\n",
        "    X_train,\n",
        "    y_train,\n",
        "    validation_split=0.2,\n",
        "    batch_size=64,\n",
        "    epochs=5,\n",
        "    verbose=1\n",
        ")"
      ],
      "metadata": {
        "id": "RKFjXgRTZOxV"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Check the accuracy reached on the validation data after the last epoch.\n",
        "val_accuracy = history.history['val_accuracy'][-1]\n",
        "print(f\"\\nТочность на валидационных данных: {val_accuracy:.4f}\")\n",
        "if val_accuracy >= 0.8:\n",
        "    print(\"Цель достигнута: accuracy >= 0.8\")\n",
        "else:\n",
        "    print(\"Цель не достигнута: accuracy < 0.8\")"
      ],
      "metadata": {
        "id": "FYcoID1UZafM"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Step 10: evaluate the trained model on the test data.\n",
        "\n",
        "# 1) Loss and accuracy on the test data.\n",
        "print(\"\\n1) Метрика качества на тестовых данных:\")\n",
        "test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)\n",
        "print(f\" Loss: {test_loss:.4f}\")\n",
        "print(f\" Accuracy: {test_accuracy:.4f}\")"
      ],
      "metadata": {
        "id": "lvqn53FDZdj0"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# 2) Classification report for the test set.\n",
        "y_score = model.predict(X_test, verbose=0)\n",
        "\n",
        "# Turn the scores into binary predictions (threshold 0.5).\n",
        "y_pred = (y_score[:, 0] >= 0.5).astype(int)\n",
        "\n",
        "print(classification_report(y_test, y_pred, labels=[0,1], target_names=['Negative','Positive']))"
      ],
      "metadata": {
        "id": "2GbHQzH2Zo_D"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# 3) ROC curve and the area under it.\n",
        "# The scores are flattened to the 1-D form the metric functions work on.\n",
        "fpr, tpr, thresholds = roc_curve(y_test, y_score.ravel())\n",
        "roc_auc = auc(fpr, tpr)\n",
        "\n",
        "fig, ax = plt.subplots(figsize=(8, 6))\n",
        "ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')\n",
        "ax.set_xlim([0.0, 1.0])\n",
        "ax.set_ylim([0.0, 1.05])\n",
        "ax.set_xlabel('False Positive Rate')\n",
        "ax.set_ylabel('True Positive Rate')\n",
        "ax.set_title('ROC Curve')\n",
        "ax.legend(loc=\"lower right\")\n",
        "ax.grid(True)\n",
        "plt.show()"
      ],
      "metadata": {
        "id": "7bclQvRoZu0F"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Compute AUC-ROC directly from the labels and scores.\n",
        "auc_roc = roc_auc_score(y_test, y_score.ravel())\n",
        "print(f\" Площадь под ROC-кривой (AUC-ROC): {auc_roc:.4f}\")"
      ],
      "metadata": {
        "id": "StkTG5k7ZxxD"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}