Вы не можете выбрать более 25 тем. Темы должны начинаться с буквы или цифры, могут содержать дефисы (-) и должны содержать не более 35 символов.

313 строк
12 KiB
Plaintext

{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"gpuType": "T4"
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4W5M3DBiisqY"
},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"device_name = tf.test.gpu_device_name()\n",
"if device_name != '/device:GPU:0':\n",
" raise SystemError('GPU device not found')\n",
"print('Found GPU at: {}'.format(device_name))\n"
]
},
{
"cell_type": "code",
"source": [
"import os\n",
"import random\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"from tensorflow import keras\n",
"from tensorflow.keras import layers\n",
"from tensorflow.keras.datasets import imdb\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from sklearn.metrics import classification_report, roc_curve, auc, roc_auc_score\n",
"\n",
"print(\"TensorFlow version:\", tf.__version__)"
],
"metadata": {
"id": "k7GXIRZQtYzM"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"k = 5\n",
"seed = 4 * k - 1\n",
"\n",
"vocabulary_size = 5000\n",
"index_from = 3\n",
"\n",
"(X_train, y_train), (X_test, y_test) = imdb.load_data(\n",
" path=\"imdb.npz\",\n",
" num_words=vocabulary_size,\n",
" skip_top=0,\n",
" maxlen=None,\n",
" seed=seed,\n",
" start_char=1,\n",
" oov_char=2,\n",
" index_from=index_from\n",
")\n",
"\n",
"print(\"Размеры: X_train={}, y_train={}, X_test={}, y_test={}\".format(\n",
" X_train.shape, y_train.shape, X_test.shape, y_test.shape))\n"
],
"metadata": {
"id": "pbNyjOJItin3"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# 3) Отобразить один отзыв в виде индексов и в виде текста; вывести длину и метку\n",
"# Сначала подготовим словарь id->word\n",
"word_to_id = imdb.get_word_index()\n",
"word_to_id = {key: (value + index_from) for key, value in word_to_id.items()}\n",
"word_to_id[\"<PAD>\"] = 0\n",
"word_to_id[\"<START>\"] = 1\n",
"word_to_id[\"<UNK>\"] = 2\n",
"word_to_id[\"<UNUSED>\"] = 3\n",
"id_to_word = {value: key for key, value in word_to_id.items()}\n",
"\n",
"some_index = 0 # индекс примера в X_train; при необходимости изменить\n",
"review_indices = X_train[some_index]\n",
"print(\"Отзыв (список индексов):\", review_indices)\n",
"review_text = ' '.join(id_to_word.get(i, \"<UNK>\") for i in review_indices)\n",
"print(\"\\nОтзыв (текст):\\n\", review_text)\n",
"print(\"\\nДлина отзыва (число индексов):\", len(review_indices))\n",
"print(\"Меткa класса (y):\", y_train[some_index],\n",
" \"- название класса:\", (\"Positive\" if y_train[some_index] == 1 else \"Negative\"))\n"
],
"metadata": {
"id": "SVmnorZgt6Bv"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# 4) Максимальная и минимальная длина отзыва в обучающем множестве\n",
"max_len = len(max(X_train, key=len))\n",
"min_len = len(min(X_train, key=len))\n",
"print(\"Максимальная длина отзыва (в индексах):\", max_len)\n",
"print(\"Минимальная длина отзыва (в индексах):\", min_len)"
],
"metadata": {
"id": "qIRCcZdNuG3f"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# 5) Предобработка — приведение к единой длине\n",
"max_words = 500\n",
"X_train_prep = pad_sequences(X_train, maxlen=max_words, value=0, padding='pre', truncating='post')\n",
"X_test_prep = pad_sequences(X_test, maxlen=max_words, value=0, padding='pre', truncating='post')\n",
"print(\"Форма X_train_prep:\", X_train_prep.shape)\n",
"print(\"Форма X_test_prep:\", X_test_prep.shape)"
],
"metadata": {
"id": "nxrBn8g2uKFY"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# 6) Проверка: максимальная и минимальная длина после предобработки\n",
"print(\"После предобработки: длина\", X_train_prep.shape[1])"
],
"metadata": {
"id": "Uvac7sCEuRPS"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# 7) Показать тот же отзыв после предобработки и сравнить\n",
"prep_review_indices = X_train_prep[some_index]\n",
"print(\"Предобработанный отзыв (индексы):\", prep_review_indices)\n",
"prep_review_text = ' '.join(id_to_word.get(i, \"<PAD>\") for i in prep_review_indices if i != 0)\n",
"print(\"\\nПредобработанный отзыв (текст, без <PAD>):\\n\", prep_review_text)\n",
"print(\"\\nДлина предобработанного отзыва:\", len(prep_review_indices))"
],
"metadata": {
"id": "0TM-1v-DuWtH"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# 8) Вывести массивы и размерности\n",
"print(\"X_train_prep shape:\", X_train_prep.shape)\n",
"print(\"X_test_prep shape: \", X_test_prep.shape)\n",
"print(\"y_train shape:\", y_train.shape)\n",
"print(\"y_test shape: \", y_test.shape)\n",
"# Показать первые 3 примера предобработанных входов и меток\n",
"for i in range(3):\n",
" print(f\"\\nПример {i} (индексы):\", X_train_prep[i][:300], \"...\")\n",
" print(f\"Метка:\", y_train[i])"
],
"metadata": {
"id": "XxQ_xW3oubMs"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# 9) Создание и обучение модели\n",
"vocab_size = vocabulary_size\n",
"embedding_dim = 32\n",
"input_length = max_words\n",
"\n",
"model = keras.Sequential([\n",
" layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=input_length),\n",
" layers.LSTM(64),\n",
" layers.Dropout(0.3),\n",
" layers.Dense(1, activation='sigmoid')\n",
"])\n",
"\n",
"model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
"model.summary()\n",
"\n",
"# Обучение с выделением валидации\n",
"history = model.fit(\n",
" X_train_prep, y_train,\n",
" epochs=4,\n",
" batch_size=64,\n",
" validation_split=0.2,\n",
" verbose=1\n",
")"
],
"metadata": {
"id": "uM62eg4duyBd"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# 10) Оценка качества на тестовой выборке\n",
"# Оценка метрик\n",
"eval_results = model.evaluate(X_test_prep, y_test, verbose=1)\n",
"print(\"Результаты оценки на тесте (loss, accuracy):\", eval_results)\n",
"\n",
"# Получение \"сырых\" предсказаний и бинарных меток\n",
"y_score = model.predict(X_test_prep)\n",
"y_pred = [1 if y_score[i,0] >= 0.5 else 0 for i in range(len(y_score))]\n",
"\n",
"# Отчёт о качестве классификации\n",
"print(\"\\nClassification report:\\n\")\n",
"print(classification_report(y_test, y_pred, labels=[0,1], target_names=['Negative','Positive']))\n",
"\n",
"# ROC-кривая и AUC\n",
"fpr, tpr, thresholds = roc_curve(y_test, y_score)\n",
"roc_auc = auc(fpr, tpr)\n",
"print(\"\\nAUC ROC (ручной вычисление):\", roc_auc)\n",
"print(\"AUC ROC (sklearn):\", roc_auc_score(y_test, y_score))\n",
"\n",
"# Построение ROC-кривой (в Colab отобразится график)\n",
"import matplotlib.pyplot as plt\n",
"plt.figure()\n",
"plt.plot(fpr, tpr)\n",
"plt.grid()\n",
"plt.xlabel('False Positive Rate')\n",
"plt.ylabel('True Positive Rate')\n",
"plt.title('ROC')\n",
"plt.show()"
],
"metadata": {
"id": "332eGXS9vJNm"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"source": [
"# ищем индекс отзыва, длина которого > 500\n",
"long_index = None\n",
"for i, review in enumerate(X_train):\n",
" if len(review) > 500:\n",
" long_index = i\n",
" break\n",
"\n",
"print(\"Найден индекс длинного отзыва:\", long_index)\n",
"print(\"Исходная длина:\", len(X_train[long_index]))\n",
"\n",
"# исходные индексы\n",
"orig = X_train[long_index]\n",
"\n",
"# преобразование индексов в текст\n",
"def to_text(indices):\n",
" return ' '.join(id_to_word.get(i, \"<UNK>\") for i in indices)\n",
"\n",
"print(\"\\nИсходный отзыв (первые 50 индексов):\")\n",
"print(orig[:50])\n",
"print(\"\\nТекст (первые 50 токенов):\")\n",
"print(to_text(orig[:50]))\n",
"\n",
"print(\"\\nИсходный отзыв (последние 50 индексов):\")\n",
"print(orig[-50:])\n",
"print(\"\\nТекст (последние 50 токенов):\")\n",
"print(to_text(orig[-50:]))\n",
"\n",
"# предобработанный вариант\n",
"prep = X_train_prep[long_index]\n",
"\n",
"print(\"\\nПредобработанный отзыв (длина):\", len(prep))\n",
"\n",
"print(\"\\nПосле предобработки (первые 50 индексов):\")\n",
"print(prep[:50])\n",
"print(\"\\nТекст (первые 50 токенов после обрезания):\")\n",
"print(to_text(prep[:50]))\n",
"\n",
"print(\"\\nПосле предобработки (последние 50 индексов):\")\n",
"print(prep[-50:])\n",
"print(\"\\nТекст (последние 50 токенов после обрезания):\")\n",
"print(to_text(prep[-50:]))\n"
],
"metadata": {
"id": "XHUmdTXl_vKb"
},
"execution_count": null,
"outputs": []
}
]
}