{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [
{ "cell_type": "code", "execution_count": null, "metadata": { "id": "4W5M3DBiisqY" }, "outputs": [], "source": [
"import tensorflow as tf\n",
"device_name = tf.test.gpu_device_name()\n",
"if device_name != '/device:GPU:0':\n",
"    raise SystemError('GPU device not found')\n",
"print('Found GPU at: {}'.format(device_name))\n" ] },
{ "cell_type": "code", "source": [
"import os\n",
"import random\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"from tensorflow import keras\n",
"from tensorflow.keras import layers\n",
"from tensorflow.keras.datasets import imdb\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from sklearn.metrics import classification_report, roc_curve, auc, roc_auc_score\n",
"\n",
"print(\"TensorFlow version:\", tf.__version__)" ], "metadata": { "id": "k7GXIRZQtYzM" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Load the IMDB dataset: top vocabulary_size words, variant-specific shuffle seed\n",
"k = 5\n",
"seed = 4 * k - 1\n",
"\n",
"vocabulary_size = 5000\n",
"index_from = 3\n",
"\n",
"(X_train, y_train), (X_test, y_test) = imdb.load_data(\n",
"    path=\"imdb.npz\",\n",
"    num_words=vocabulary_size,\n",
"    skip_top=0,\n",
"    maxlen=None,\n",
"    seed=seed,\n",
"    start_char=1,\n",
"    oov_char=2,\n",
"    index_from=index_from\n",
")\n",
"\n",
"print(\"Shapes: X_train={}, y_train={}, X_test={}, y_test={}\".format(\n",
"    X_train.shape, y_train.shape, X_test.shape, y_test.shape))\n" ], "metadata": { "id": "pbNyjOJItin3" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# 3) Display one review as a list of indices and as text; print its length and label\n",
"# First build the id->word dictionary\n",
"word_to_id = imdb.get_word_index()\n",
"word_to_id = {key: (value + index_from) for key, value in word_to_id.items()}\n",
"word_to_id[\"<PAD>\"] = 0\n",
"word_to_id[\"<START>\"] = 1\n",
"word_to_id[\"<UNK>\"] = 2\n",
"word_to_id[\"<UNUSED>\"] = 3\n",
"id_to_word = {value: key for key, value in word_to_id.items()}\n",
"\n",
"some_index = 0  # index of the example in X_train; change if needed\n",
"review_indices = X_train[some_index]\n",
"print(\"Review (list of indices):\", review_indices)\n",
"review_text = ' '.join(id_to_word.get(i, \"<UNK>\") for i in review_indices)\n",
"print(\"\\nReview (text):\\n\", review_text)\n",
"print(\"\\nReview length (number of indices):\", len(review_indices))\n",
"print(\"Class label (y):\", y_train[some_index],\n",
"      \"- class name:\", (\"Positive\" if y_train[some_index] == 1 else \"Negative\"))\n" ], "metadata": { "id": "SVmnorZgt6Bv" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# 4) Maximum and minimum review length in the training set\n",
"max_len = len(max(X_train, key=len))\n",
"min_len = len(min(X_train, key=len))\n",
"print(\"Maximum review length (in indices):\", max_len)\n",
"print(\"Minimum review length (in indices):\", min_len)" ], "metadata": { "id": "qIRCcZdNuG3f" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# 5) Preprocessing: pad/truncate every review to the same length\n",
"max_words = 500\n",
"X_train_prep = pad_sequences(X_train, maxlen=max_words, value=0, padding='pre', truncating='post')\n",
"X_test_prep = pad_sequences(X_test, maxlen=max_words, value=0, padding='pre', truncating='post')\n",
"print(\"X_train_prep shape:\", X_train_prep.shape)\n",
"print(\"X_test_prep shape:\", X_test_prep.shape)" ], "metadata": { "id": "nxrBn8g2uKFY" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# 6) Check: maximum and minimum review length after preprocessing\n",
"lengths_after = [len(review) for review in X_train_prep]\n",
"print(\"After preprocessing: max length =\", max(lengths_after), \"min length =\", min(lengths_after))" ], "metadata": { "id": "Uvac7sCEuRPS" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# 7) Show the same review after preprocessing and compare\n",
"prep_review_indices = X_train_prep[some_index]\n",
"print(\"Preprocessed review (indices):\", prep_review_indices)\n",
"prep_review_text = ' '.join(id_to_word.get(i, \"<UNK>\") for i in prep_review_indices if i != 0)\n",
"print(\"\\nPreprocessed review (text, without <PAD>):\\n\", prep_review_text)\n",
"print(\"\\nLength of the preprocessed review:\", len(prep_review_indices))" ], "metadata": { "id": "0TM-1v-DuWtH" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# 8) Print the arrays and their shapes\n",
"print(\"X_train_prep shape:\", X_train_prep.shape)\n",
"print(\"X_test_prep shape: \", X_test_prep.shape)\n",
"print(\"y_train shape:\", y_train.shape)\n",
"print(\"y_test shape: \", y_test.shape)\n",
"# Show the first 3 preprocessed inputs and their labels\n",
"for i in range(3):\n",
"    print(f\"\\nExample {i} (indices):\", X_train_prep[i][:300], \"...\")\n",
"    print(\"Label:\", y_train[i])" ], "metadata": { "id": "XxQ_xW3oubMs" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# 9) Build and train the model\n",
"vocab_size = vocabulary_size\n",
"embedding_dim = 32\n",
"input_length = max_words\n",
"\n",
"# The fixed input length is declared with an explicit Input layer; the Embedding\n",
"# input_length argument is deprecated/removed in newer Keras versions.\n",
"model = keras.Sequential([\n",
"    layers.Input(shape=(input_length,), dtype='int32'),\n",
"    layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim),\n",
"    layers.LSTM(64),\n",
"    layers.Dropout(0.3),\n",
"    layers.Dense(1, activation='sigmoid')\n",
"])\n",
"\n",
"model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
"model.summary()\n",
"\n",
"# Train with a validation split\n",
"history = model.fit(\n",
"    X_train_prep, y_train,\n",
"    epochs=4,\n",
"    batch_size=64,\n",
"    validation_split=0.2,\n",
"    verbose=1\n",
")" ], "metadata": { "id": "uM62eg4duyBd" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# 10) Evaluate the model on the test set\n",
"eval_results = model.evaluate(X_test_prep, y_test, verbose=1)\n",
"print(\"Test evaluation results (loss, accuracy):\", eval_results)\n",
"\n",
"# Raw predicted probabilities and binary predictions\n",
"y_score = model.predict(X_test_prep).ravel()\n",
"y_pred = (y_score >= 0.5).astype(int)\n",
"\n",
"# Classification quality report\n",
"print(\"\\nClassification report:\\n\")\n",
"print(classification_report(y_test, y_pred, labels=[0, 1], target_names=['Negative', 'Positive']))\n",
"\n",
"# ROC curve and AUC\n",
"fpr, tpr, thresholds = roc_curve(y_test, y_score)\n",
"roc_auc = auc(fpr, tpr)\n",
"print(\"\\nAUC ROC (from the ROC curve via auc()):\", roc_auc)\n",
"print(\"AUC ROC (sklearn roc_auc_score):\", roc_auc_score(y_test, y_score))\n",
"\n",
"# Plot the ROC curve (the figure is rendered inline in Colab)\n",
"import matplotlib.pyplot as plt\n",
"plt.figure()\n",
"plt.plot(fpr, tpr)\n",
"plt.grid()\n",
"plt.xlabel('False Positive Rate')\n",
"plt.ylabel('True Positive Rate')\n",
"plt.title('ROC')\n",
"plt.show()" ], "metadata": { "id": "332eGXS9vJNm" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
"# Find the index of a training review that is longer than 500 tokens\n",
"long_index = None\n",
"for i, review in enumerate(X_train):\n",
"    if len(review) > 500:\n",
"        long_index = i\n",
"        break\n",
"\n",
"print(\"Index of the long review found:\", long_index)\n",
"print(\"Original length:\", len(X_train[long_index]))\n",
"\n",
"# original indices\n",
"orig = X_train[long_index]\n",
"\n",
"# convert indices to text\n",
"def to_text(indices):\n",
"    return ' '.join(id_to_word.get(i, \"<UNK>\") for i in indices)\n",
"\n",
"print(\"\\nOriginal review (first 50 indices):\")\n",
"print(orig[:50])\n",
"print(\"\\nText (first 50 tokens):\")\n",
"print(to_text(orig[:50]))\n",
"\n",
"print(\"\\nOriginal review (last 50 indices):\")\n",
"print(orig[-50:])\n",
"print(\"\\nText (last 50 tokens):\")\n",
"print(to_text(orig[-50:]))\n",
"\n",
"# preprocessed version\n",
"prep = X_train_prep[long_index]\n",
"\n",
"print(\"\\nPreprocessed review (length):\", len(prep))\n",
"\n",
"print(\"\\nAfter preprocessing (first 50 indices):\")\n",
"print(prep[:50])\n",
"print(\"\\nText (first 50 tokens after truncation):\")\n",
"print(to_text(prep[:50]))\n",
"\n",
"print(\"\\nAfter preprocessing (last 50 indices):\")\n",
"print(prep[-50:])\n",
"print(\"\\nText (last 50 tokens after truncation):\")\n",
"print(to_text(prep[-50:]))\n" ], "metadata": { "id": "XHUmdTXl_vKb" }, "execution_count": null, "outputs": [] } ] }