diff --git a/labworks/LW4/IS_LR4.ipynb b/labworks/LW4/IS_LR4.ipynb new file mode 100644 index 0000000..45039ad --- /dev/null +++ b/labworks/LW4/IS_LR4.ipynb @@ -0,0 +1 @@ +{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"gpuType":"T4","mount_file_id":"1dqGOXcLwVQwLYdgcud5XOoxGoEaRIQsZ","authorship_tag":"ABX9TyN6RT6AzdQtvrIpuV+YgB7t"},"kernelspec":{"name":"python3","display_name":"Python 3"},"language_info":{"name":"python"},"accelerator":"GPU"},"cells":[{"cell_type":"code","execution_count":1,"metadata":{"id":"Sk1rdDVJ_RSy","executionInfo":{"status":"ok","timestamp":1765315324986,"user_tz":-180,"elapsed":45,"user":{"displayName":"Мирон Романов","userId":"18135774377279153892"}}},"outputs":[],"source":["import os\n","os.chdir('/content/drive/MyDrive/Colab Notebooks/IS_LR4')"]},{"cell_type":"code","source":["import tensorflow as tf\n","device_name = tf.test.gpu_device_name()\n","if device_name != '/device:GPU:0':\n"," raise SystemError('GPU device not found')\n","print('Found GPU at: {}'.format(device_name))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ltCjrjZG_mAf","executionInfo":{"status":"ok","timestamp":1765315352692,"user_tz":-180,"elapsed":10426,"user":{"displayName":"Мирон Романов","userId":"18135774377279153892"}},"outputId":"b69dc0b6-5ca3-44bd-e130-033f24915ce4"},"execution_count":2,"outputs":[{"output_type":"stream","name":"stdout","text":["Found GPU at: /device:GPU:0\n"]}]},{"cell_type":"code","source":["# загрузка датасета\n","from keras.datasets import imdb\n","vocabulary_size = 5000\n","index_from = 3\n","(X_train, y_train), (X_test, y_test) = imdb.load_data(path=\"imdb.npz\",\n"," num_words=vocabulary_size,\n"," skip_top=0,\n"," maxlen=None,\n"," seed=35,\n"," start_char=1,\n"," oov_char=2,\n"," index_from=index_from\n"," )\n","print('Shape of X train:', X_train.shape)\n","print('Shape of y train:', y_train.shape)\n","print('Shape of X test:', X_test.shape)\n","print('Shape of y test:', 
y_test.shape)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"AEb6ZdYOALQl","executionInfo":{"status":"ok","timestamp":1765316082917,"user_tz":-180,"elapsed":3530,"user":{"displayName":"Мирон Романов","userId":"18135774377279153892"}},"outputId":"be205b9e-f927-4371-f860-0a7d603b0097"},"execution_count":10,"outputs":[{"output_type":"stream","name":"stdout","text":["Shape of X train: (25000,)\n","Shape of y train: (25000,)\n","Shape of X test: (25000,)\n","Shape of y test: (25000,)\n"]}]},{"cell_type":"code","source":["# создание словаря для перевода индексов в слова\n","# заргузка словаря \"слово:индекс\"\n","word_to_id = imdb.get_word_index()\n","# уточнение словаря\n","word_to_id = {key:(value + index_from) for key,value in word_to_id.items()}\n","word_to_id[\"\"] = 0\n","word_to_id[\"\"] = 1\n","word_to_id[\"\"] = 2\n","word_to_id[\"\"] = 3\n","# создание обратного словаря \"индекс:слово\"\n","id_to_word = {value:key for key,value in word_to_id.items()}\n","\n","review_indices = X_train[19]\n","print(\"Review - index:\\n\", review_indices)\n","\n","review_text = \" \".join(id_to_word.get(i, \"?\") for i in review_indices)\n","print(\"\\nReview - text:\\n\", review_text)\n","\n","print(\"\\nReview length:\", len(review_indices))\n","\n","label = y_train[19]\n","class_name = \"Positive\" if label == 1 else \"Negative\"\n","print(\"Class label:\", label, \"| Class name:\", class_name)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"WywjkQNXClqM","executionInfo":{"status":"ok","timestamp":1765316515983,"user_tz":-180,"elapsed":85,"user":{"displayName":"Мирон Романов","userId":"18135774377279153892"}},"outputId":"31b86852-c762-47c6-b6c9-69c07a020178"},"execution_count":12,"outputs":[{"output_type":"stream","name":"stdout","text":["Review - index:\n"," [1, 608, 50, 26, 84, 37, 144, 24, 67, 14, 20, 10, 10, 300, 92, 67, 12, 48, 25, 92, 40, 2006, 42, 328, 1285, 241, 92, 40, 12, 48, 25, 188, 4154, 34, 4, 2, 342, 92, 67, 12, 48, 25, 
181, 6, 622, 3783, 20, 10, 10, 4, 360, 7, 25, 521, 92, 1135, 8, 67, 736, 349, 45, 163, 45, 2812, 45, 6, 1917, 2, 7, 175, 78, 3783, 4896, 573, 8, 132, 2552, 2, 83, 4715, 312, 1285, 92, 2457, 4, 3028, 11, 3850, 364, 1317, 253, 7, 2, 2, 1022, 4106, 5, 4391, 2, 17, 73, 17, 6, 378, 7, 1139, 4139, 531, 34, 2, 3409, 5, 2, 2, 52, 8, 67, 4841, 2, 397, 157, 99, 13, 1498, 32, 4, 96, 143, 1254, 2, 643, 916, 21, 52]\n","\n","Review - text:\n"," ok there are people who should not see this movie br br 1 don't see it if you don't like satire or black humour 2 don't like it if you got offended by the 3 don't see it if you want a serious superhero movie br br the rest of you run don't walk to see mystery men it's funny it's quirky it's a delightful of every bad superhero cliche known to man occasional into junior high humour don't ruin the tongue in cheek low key fun of ben stiller and hank as well as a couple of amusing smaller parts by rush and good to see louise getting work too i laughed all the way through utterly somewhat weird but good\n","\n","Review length: 134\n","Class label: 1 | Class name: Positive\n"]}]},{"cell_type":"code","source":["print(\"Max review length:\", len(max(X_train, key=len)))\n","print(\"Min review length:\", len(min(X_train, key=len)))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"rjanZPAMEZ4g","executionInfo":{"status":"ok","timestamp":1765316620934,"user_tz":-180,"elapsed":12,"user":{"displayName":"Мирон Романов","userId":"18135774377279153892"}},"outputId":"06e1f94b-8aee-45b0-e771-5168f941ffee"},"execution_count":13,"outputs":[{"output_type":"stream","name":"stdout","text":["Max review length: 2494\n","Min review length: 11\n"]}]},{"cell_type":"code","source":["# предобработка данных\n","from tensorflow.keras.utils import pad_sequences\n","max_words = 500\n","X_train = pad_sequences(X_train, maxlen=max_words, value=0, padding='pre', truncating='post')\n","X_test = pad_sequences(X_test, maxlen=max_words, value=0, padding='pre', 
truncating='post')"],"metadata":{"id":"XdwfZ6W3EnPW","executionInfo":{"status":"ok","timestamp":1765316656611,"user_tz":-180,"elapsed":917,"user":{"displayName":"Мирон Романов","userId":"18135774377279153892"}}},"execution_count":14,"outputs":[]},{"cell_type":"code","source":["print(\"Max review length:\", len(max(X_train, key=len)))\n","print(\"Min review length:\", len(min(X_train, key=len)))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"_ioXpqEbEtb7","executionInfo":{"status":"ok","timestamp":1765316681395,"user_tz":-180,"elapsed":6,"user":{"displayName":"Мирон Романов","userId":"18135774377279153892"}},"outputId":"40a46ef6-8a3c-48a0-ef7f-b0ca496b6af3"},"execution_count":15,"outputs":[{"output_type":"stream","name":"stdout","text":["Max review length: 500\n","Min review length: 500\n"]}]},{"cell_type":"code","source":["review_indices = X_train[19]\n","print(\"Review - index:\\n\", review_indices)\n","\n","review_text = \" \".join(id_to_word.get(i, \"?\") for i in review_indices)\n","print(\"\\nReview - text:\\n\", review_text)\n","\n","print(\"\\nReview length:\", len(review_indices))\n","\n","label = y_train[19]\n","class_name = \"Positive\" if label == 1 else \"Negative\"\n","print(\"Class label:\", label, \"| Class name:\", class_name)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"vOAcIjEkE8K6","executionInfo":{"status":"ok","timestamp":1765316741347,"user_tz":-180,"elapsed":16,"user":{"displayName":"Мирон Романов","userId":"18135774377279153892"}},"outputId":"e0a4954a-086e-4096-d58d-227323934389"},"execution_count":16,"outputs":[{"output_type":"stream","name":"stdout","text":["Review - index:\n"," [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 
0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"," 0 0 1 608 50 26 84 37 144 24 67 14 20 10\n"," 10 300 92 67 12 48 25 92 40 2006 42 328 1285 241\n"," 92 40 12 48 25 188 4154 34 4 2 342 92 67 12\n"," 48 25 181 6 622 3783 20 10 10 4 360 7 25 521\n"," 92 1135 8 67 736 349 45 163 45 2812 45 6 1917 2\n"," 7 175 78 3783 4896 573 8 132 2552 2 83 4715 312 1285\n"," 92 2457 4 3028 11 3850 364 1317 253 7 2 2 1022 4106\n"," 5 4391 2 17 73 17 6 378 7 1139 4139 531 34 2\n"," 3409 5 2 2 52 8 67 4841 2 397 157 99 13 1498\n"," 32 4 96 143 1254 2 643 916 21 52]\n","\n","Review - text:\n"," ok there are people who should not see this movie br br 1 don't see it if you don't like satire or black humour 2 don't like it if you got offended by the 3 don't see it if you want a serious superhero movie br br the rest of you run don't walk to see mystery men it's funny it's quirky it's a delightful of every bad superhero cliche known to man occasional into junior high humour don't ruin the tongue in cheek low key fun of ben stiller and hank as well as a couple of amusing smaller parts by rush and good to see louise getting work too i laughed all the way through utterly somewhat weird but good\n","\n","Review length: 500\n","Class label: 1 | Class name: Positive\n"]}]},{"cell_type":"code","source":["print(\"Preprocessed training set X_train (first 3 examples):\")\n","print(X_train[:3])\n","\n","print(\"\\nPreprocessed training set X_test (first 3 examples):\")\n","print(X_test[:3])\n","\n","\n","print(\"Size of 
X_train:\", X_train.shape)\n","print(\"Size of y_train:\", y_train.shape)\n","print(\"Size of X_test:\", X_test.shape)\n","print(\"Size of y_test:\", y_test.shape)"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"e1-rP9DOFo0M","executionInfo":{"status":"ok","timestamp":1765317120012,"user_tz":-180,"elapsed":15,"user":{"displayName":"Мирон Романов","userId":"18135774377279153892"}},"outputId":"7346eb8b-c4c2-4b51-df86-292c841e9c0f"},"execution_count":21,"outputs":[{"output_type":"stream","name":"stdout","text":["Preprocessed training set X_train (first 3 examples):\n","[[ 0 0 0 ... 8 591 1462]\n"," [ 0 0 0 ... 28 35 585]\n"," [ 0 0 0 ... 11 2 2]]\n","\n","Preprocessed training set X_test (first 3 examples):\n","[[ 0 0 0 ... 14 356 22]\n"," [ 0 0 0 ... 301 87 22]\n"," [ 0 0 0 ... 46 7 158]]\n","Size of X_train: (25000, 500)\n","Size of y_train: (25000,)\n","Size of X_test: (25000, 500)\n","Size of y_test: (25000,)\n"]}]},{"cell_type":"code","source":["from tensorflow.keras.models import Sequential\n","from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense\n","\n","\n","model = Sequential()\n","model.add(Embedding(\n"," input_dim=vocabulary_size + index_from,\n"," output_dim=32,\n"," input_length=max_words\n","))\n","model.add(LSTM(67))\n","model.add(Dropout(0.5))\n","model.add(Dense(1, activation='sigmoid'))\n","\n","model.compile(\n"," loss='binary_crossentropy',\n"," optimizer='adam',\n"," metrics=['accuracy']\n",")\n","\n","model.build(input_shape=(None, max_words))\n","model.summary()\n","\n","# Обучение модели\n","history = model.fit(\n"," X_train,\n"," y_train,\n"," epochs=5,\n"," batch_size=64,\n"," validation_split=0.2,\n"," verbose=1\n",")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":485},"id":"gx2Lz72DGfzA","executionInfo":{"status":"ok","timestamp":1765317634152,"user_tz":-180,"elapsed":56373,"user":{"displayName":"Мирон 
Романов","userId":"18135774377279153892"}},"outputId":"50a5e229-abf6-4c48-b279-586009918627"},"execution_count":22,"outputs":[{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.12/dist-packages/keras/src/layers/core/embedding.py:97: UserWarning: Argument `input_length` is deprecated. Just remove it.\n"," warnings.warn(\n"]},{"output_type":"display_data","data":{"text/plain":["\u001b[1mModel: \"sequential\"\u001b[0m\n"],"text/html":["
Model: \"sequential\"\n","
\n"]},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n","┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n","┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n","│ embedding (\u001b[38;5;33mEmbedding\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m500\u001b[0m, \u001b[38;5;34m32\u001b[0m) │ \u001b[38;5;34m160,096\u001b[0m │\n","├─────────────────────────────────┼────────────────────────┼───────────────┤\n","│ lstm (\u001b[38;5;33mLSTM\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m67\u001b[0m) │ \u001b[38;5;34m26,800\u001b[0m │\n","├─────────────────────────────────┼────────────────────────┼───────────────┤\n","│ dropout (\u001b[38;5;33mDropout\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m67\u001b[0m) │ \u001b[38;5;34m0\u001b[0m │\n","├─────────────────────────────────┼────────────────────────┼───────────────┤\n","│ dense (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m1\u001b[0m) │ \u001b[38;5;34m68\u001b[0m │\n","└─────────────────────────────────┴────────────────────────┴───────────────┘\n"],"text/html":["
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n","┃ Layer (type)                     Output Shape                  Param # ┃\n","┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n","│ embedding (Embedding)           │ (None, 500, 32)        │       160,096 │\n","├─────────────────────────────────┼────────────────────────┼───────────────┤\n","│ lstm (LSTM)                     │ (None, 67)             │        26,800 │\n","├─────────────────────────────────┼────────────────────────┼───────────────┤\n","│ dropout (Dropout)               │ (None, 67)             │             0 │\n","├─────────────────────────────────┼────────────────────────┼───────────────┤\n","│ dense (Dense)                   │ (None, 1)              │            68 │\n","└─────────────────────────────────┴────────────────────────┴───────────────┘\n","
\n"]},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["\u001b[1m Total params: \u001b[0m\u001b[38;5;34m186,964\u001b[0m (730.33 KB)\n"],"text/html":["
 Total params: 186,964 (730.33 KB)\n","
\n"]},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m186,964\u001b[0m (730.33 KB)\n"],"text/html":["
 Trainable params: 186,964 (730.33 KB)\n","
\n"]},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n"],"text/html":["
 Non-trainable params: 0 (0.00 B)\n","
\n"]},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Epoch 1/5\n","\u001b[1m313/313\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m13s\u001b[0m 25ms/step - accuracy: 0.6426 - loss: 0.6635 - val_accuracy: 0.6048 - val_loss: 0.6939\n","Epoch 2/5\n","\u001b[1m313/313\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m17s\u001b[0m 24ms/step - accuracy: 0.5151 - loss: 0.7202 - val_accuracy: 0.6084 - val_loss: 0.6766\n","Epoch 3/5\n","\u001b[1m313/313\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 23ms/step - accuracy: 0.5620 - loss: 0.6804 - val_accuracy: 0.7786 - val_loss: 0.5682\n","Epoch 4/5\n","\u001b[1m313/313\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m10s\u001b[0m 22ms/step - accuracy: 0.7489 - loss: 0.5362 - val_accuracy: 0.7468 - val_loss: 0.5106\n","Epoch 5/5\n","\u001b[1m313/313\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m8s\u001b[0m 24ms/step - accuracy: 0.8451 - loss: 0.3959 - val_accuracy: 0.8556 - val_loss: 0.3406\n"]}]},{"cell_type":"code","source":["test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)\n","\n","print(\"Classification results\")\n","print(f\"Test accuracy: {test_accuracy:.4f}\")"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"izA_1WRRI5ba","executionInfo":{"status":"ok","timestamp":1765317817014,"user_tz":-180,"elapsed":7047,"user":{"displayName":"Мирон Романов","userId":"18135774377279153892"}},"outputId":"8445ec24-49ae-4868-c80a-31bcbb0b6baf"},"execution_count":23,"outputs":[{"output_type":"stream","name":"stdout","text":["Classification results\n","Test accuracy: 0.8519\n"]}]},{"cell_type":"code","source":["y_score = model.predict(X_test)\n","y_pred = [1 if y_score[i,0]>=0.5 else 0 for i in range(len(y_score))]\n","from sklearn.metrics import classification_report\n","print(classification_report(y_test, y_pred, labels = [0, 
1], target_names=['Negative', 'Positive']))"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"S8NHPV47JkoZ","executionInfo":{"status":"ok","timestamp":1765317966588,"user_tz":-180,"elapsed":10535,"user":{"displayName":"Мирон Романов","userId":"18135774377279153892"}},"outputId":"4ea613c3-c530-4b0c-c32b-cb6871ac16f5"},"execution_count":24,"outputs":[{"output_type":"stream","name":"stdout","text":["\u001b[1m782/782\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m9s\u001b[0m 12ms/step\n"," precision recall f1-score support\n","\n"," Negative 0.85 0.86 0.85 12500\n"," Positive 0.85 0.85 0.85 12500\n","\n"," accuracy 0.85 25000\n"," macro avg 0.85 0.85 0.85 25000\n","weighted avg 0.85 0.85 0.85 25000\n","\n"]}]},{"cell_type":"code","source":["from sklearn.metrics import roc_curve, auc\n","import matplotlib.pyplot as plt\n","fpr, tpr, thresholds = roc_curve(y_test, y_score)\n","plt.plot(fpr, tpr)\n","plt.grid()\n","plt.xlabel('False Positive Rate')\n","plt.ylabel('True Positive Rate')\n","plt.title('ROC')\n","plt.show()\n","print('Area under ROC is', auc(fpr, tpr))"],"metadata":{"id":"CdHMBHqYKR7V","executionInfo":{"status":"ok","timestamp":1765318141382,"user_tz":-180,"elapsed":498,"user":{"displayName":"Мирон Романов","userId":"18135774377279153892"}},"outputId":"d5b940b0-a18e-40ba-c87c-51297487ab85","colab":{"base_uri":"https://localhost:8080/","height":489}},"execution_count":25,"outputs":[{"output_type":"display_data","data":{"text/plain":["
"],"image/png":"\n"},"metadata":{}},{"output_type":"stream","name":"stdout","text":["Area under ROC is 0.925042592\n"]}]}]} \ No newline at end of file diff --git a/labworks/LW4/model.png b/labworks/LW4/model.png new file mode 100644 index 0000000..8c7a3c9 Binary files /dev/null and b/labworks/LW4/model.png differ diff --git a/labworks/LW4/report.md b/labworks/LW4/report.md new file mode 100644 index 0000000..924e97d --- /dev/null +++ b/labworks/LW4/report.md @@ -0,0 +1,339 @@ +# Отчёт по лабораторной работе №4 +Романов Мирон, Юсуфов Юнус, А-01-22 +Бригада №9 + +### Подготовка среды + +```python +import os +os.chdir('/content/drive/MyDrive/Colab Notebooks/IS_LR4') +``` + +## 1) В среде Google Colab создали новый блокнот (notebook). Настроили блокнот для работы с аппаратным ускорителем GPU + +```python +import tensorflow as tf +device_name = tf.test.gpu_device_name() +if device_name != '/device:GPU:0': + raise SystemError('GPU device not found') +print('Found GPU at: {}'.format(device_name)) +``` +``` +Found GPU at: /device:GPU:0 +``` + +## 2) Загрузили набор данных IMDb, содержащий оцифрованные отзывы на фильмы, размеченные на два класса: позитивные и негативные + +```python +# загрузка датасета +from keras.datasets import imdb +vocabulary_size = 5000 +index_from = 3 +(X_train, y_train), (X_test, y_test) = imdb.load_data(path="imdb.npz", + num_words=vocabulary_size, + skip_top=0, + maxlen=None, + seed=35, + start_char=1, + oov_char=2, + index_from=index_from + ) +``` + +``` +Shape of X train: (25000,) +Shape of y train: (25000,) +Shape of X test: (25000,) +Shape of y test: (25000,) +``` + +## 3) Вывесли один отзыв из обучающего множества в виде списка индексов слов. Преобразовали список индексов в текст и вывести отзыв в виде текста. Вывели длину отзыва. 
Вывели метку класса данного отзыва и название класса (1 – Positive, 0 – Negative) + +```python +# создание словаря для перевода индексов в слова +# загрузка словаря "слово:индекс" +word_to_id = imdb.get_word_index() +# уточнение словаря +word_to_id = {key:(value + index_from) for key,value in word_to_id.items()} +word_to_id["<PAD>"] = 0 +word_to_id["<START>"] = 1 +word_to_id["<UNK>"] = 2 +word_to_id["<UNUSED>"] = 3 +# создание обратного словаря "индекс:слово" +id_to_word = {value:key for key,value in word_to_id.items()} + +review_indices = X_train[19] +print("Review - index:\n", review_indices) + +review_text = " ".join(id_to_word.get(i, "?") for i in review_indices) +print("\nReview - text:\n", review_text) + +print("\nReview length:", len(review_indices)) + +label = y_train[19] +class_name = "Positive" if label == 1 else "Negative" +print("Class label:", label, "| Class name:", class_name) +``` + +``` +Review - index: + [1, 608, 50, 26, 84, 37, 144, 24, 67, 14, 20, 10, 10, 300, 92, 67, 12, 48, 25, 92, 40, 2006, 42, 328, 1285, 241, 92, 40, 12, 48, 25, 188, 4154, 34, 4, 2, 342, 92, 67, 12, 48, 25, 181, 6, 622, 3783, 20, 10, 10, 4, 360, 7, 25, 521, 92, 1135, 8, 67, 736, 349, 45, 163, 45, 2812, 45, 6, 1917, 2, 7, 175, 78, 3783, 4896, 573, 8, 132, 2552, 2, 83, 4715, 312, 1285, 92, 2457, 4, 3028, 11, 3850, 364, 1317, 253, 7, 2, 2, 1022, 4106, 5, 4391, 2, 17, 73, 17, 6, 378, 7, 1139, 4139, 531, 34, 2, 3409, 5, 2, 2, 52, 8, 67, 4841, 2, 397, 157, 99, 13, 1498, 32, 4, 96, 143, 1254, 2, 643, 916, 21, 52] + +Review - text: + ok there are people who should not see this movie br br 1 don't see it if you don't like satire or black humour 2 don't like it if you got offended by the 3 don't see it if you want a serious superhero movie br br the rest of you run don't walk to see mystery men it's funny it's quirky it's a delightful of every bad superhero cliche known to man occasional into junior high humour don't ruin the tongue in cheek low key fun of ben stiller and hank as well as a couple of amusing 
smaller parts by rush and good to see louise getting work too i laughed all the way through utterly somewhat weird but good + +Review length: 134 +Class label: 1 | Class name: Positive +``` + +## 4) Вывели максимальную и минимальную длину отзыва в обучающем множестве + +```python +print("Max review length:", len(max(X_train, key=len))) +print("Min review length:", len(min(X_train, key=len))) +``` + +``` +Max review length: 2494 +Min review length: 11 +``` + +## 5) Провели предобработку данных + +```python +# предобработка данных +from tensorflow.keras.utils import pad_sequences +max_words = 500 +X_train = pad_sequences(X_train, maxlen=max_words, value=0, padding='pre', truncating='post') +X_test = pad_sequences(X_test, maxlen=max_words, value=0, padding='pre', truncating='post') +``` + +## 6) Повторили п. 4 + +```python +print("Max review length:", len(max(X_train, key=len))) +print("Min review length:", len(min(X_train, key=len))) +``` + +``` +Max review length: 500 +Min review length: 500 +``` + +## 7) Повторили п. 3. 
Сделали вывод о том, как отзыв преобразовался после предобработки + +```python +review_indices = X_train[19] +print("Review - index:\n", review_indices) + +review_text = " ".join(id_to_word.get(i, "?") for i in review_indices) +print("\nReview - text:\n", review_text) + +print("\nReview length:", len(review_indices)) + +label = y_train[19] +class_name = "Positive" if label == 1 else "Negative" +print("Class label:", label, "| Class name:", class_name) +``` + +``` +Review - index: + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + 0 0 1 608 50 26 84 37 144 24 67 14 20 10 + 10 300 92 67 12 48 25 92 40 2006 42 328 1285 241 + 92 40 12 48 25 188 4154 34 4 2 342 92 67 12 + 48 25 181 6 622 3783 20 10 10 4 360 7 25 521 + 92 1135 8 67 736 349 45 163 45 2812 45 6 1917 2 + 7 175 78 3783 4896 573 8 132 2552 2 83 4715 312 1285 + 92 2457 4 3028 11 3850 364 1317 253 7 2 2 1022 4106 + 5 4391 2 17 73 17 6 378 7 1139 4139 531 34 2 + 3409 5 2 2 52 8 67 4841 2 397 157 99 13 1498 + 32 4 96 143 1254 2 643 916 21 52] + +Review - text: + ok there are people who should not see this movie br br 1 don't see it if you don't like satire or black humour 2 don't like it if you got offended by the 3 don't see it if you want a serious superhero movie br br the rest of you run 
don't walk to see mystery men it's funny it's quirky it's a delightful of every bad superhero cliche known to man occasional into junior high humour don't ruin the tongue in cheek low key fun of ben stiller and hank as well as a couple of amusing smaller parts by rush and good to see louise getting work too i laughed all the way through utterly somewhat weird but good + +Review length: 500 +Class label: 1 | Class name: Positive +``` + +``` +После предобработки длина всех отзывов была приведена к 500 словам. Отзывы короче 500 слов были дополнены нулями в начале последовательности (padding='pre'), а более длинные отзывы были обрезаны с конца до 500 слов (truncating='post'). +``` + +## 8) Вывели предобработанные массивы обучающих и тестовых данных и их размерности + +```python +print("Preprocessed training set X_train (first 3 examples):") +print(X_train[:3]) + +print("\nPreprocessed training set X_test (first 3 examples):") +print(X_test[:3]) + + +print("Size of X_train:", X_train.shape) +print("Size of y_train:", y_train.shape) +print("Size of X_test:", X_test.shape) +print("Size of y_test:", y_test.shape) +``` + +``` +Preprocessed training set X_train (first 3 examples): +[[ 0 0 0 ... 8 591 1462] + [ 0 0 0 ... 28 35 585] + [ 0 0 0 ... 11 2 2]] + +Preprocessed training set X_test (first 3 examples): +[[ 0 0 0 ... 14 356 22] + [ 0 0 0 ... 301 87 22] + [ 0 0 0 ... 46 7 158]] +Size of X_train: (25000, 500) +Size of y_train: (25000,) +Size of X_test: (25000, 500) +Size of y_test: (25000,) +``` + +## 9) Реализовали модель рекуррентной нейронной сети, состоящей из слоев Embedding, LSTM, Dropout, Dense, и обучили ее на обучающих данных с выделением части обучающих данных в качестве валидационных. Вывели информацию об архитектуре нейронной сети. Добились качества обучения по метрике accuracy не менее 0.8. 
+ +```python +from tensorflow.keras.models import Sequential +from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense + + +model = Sequential() +model.add(Embedding( + input_dim=vocabulary_size + index_from, + output_dim=32, + input_length=max_words +)) +model.add(LSTM(67)) +model.add(Dropout(0.5)) +model.add(Dense(1, activation='sigmoid')) + +model.compile( + loss='binary_crossentropy', + optimizer='adam', + metrics=['accuracy'] +) + +model.build(input_shape=(None, max_words)) +model.summary() + +# Обучение модели +history = model.fit( + X_train, + y_train, + epochs=5, + batch_size=64, + validation_split=0.2, + verbose=1 +) +``` +![model](model.png) + +``` +Epoch 1/5 +313/313 ━━━━━━━━━━━━━━━━━━━━ 13s 25ms/step - accuracy: 0.6426 - loss: 0.6635 - val_accuracy: 0.6048 - val_loss: 0.6939 +Epoch 2/5 +313/313 ━━━━━━━━━━━━━━━━━━━━ 17s 24ms/step - accuracy: 0.5151 - loss: 0.7202 - val_accuracy: 0.6084 - val_loss: 0.6766 +Epoch 3/5 +313/313 ━━━━━━━━━━━━━━━━━━━━ 7s 23ms/step - accuracy: 0.5620 - loss: 0.6804 - val_accuracy: 0.7786 - val_loss: 0.5682 +Epoch 4/5 +313/313 ━━━━━━━━━━━━━━━━━━━━ 10s 22ms/step - accuracy: 0.7489 - loss: 0.5362 - val_accuracy: 0.7468 - val_loss: 0.5106 +Epoch 5/5 +313/313 ━━━━━━━━━━━━━━━━━━━━ 8s 24ms/step - accuracy: 0.8451 - loss: 0.3959 - val_accuracy: 0.8556 - val_loss: 0.3406 +``` + +## Вывели значение метрики качества классификации на тестовых +данных + +```python +test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0) + +print("Classification results") +print(f"Test accuracy: {test_accuracy:.4f}") +``` + +``` +Classification results +Test accuracy: 0.8519 +``` + +## Вывели отчет о качестве классификации тестовой выборки + +```python +y_score = model.predict(X_test) +y_pred = [1 if y_score[i,0]>=0.5 else 0 for i in range(len(y_score))] +from sklearn.metrics import classification_report +print(classification_report(y_test, y_pred, labels = [0, 1], target_names=['Negative', 'Positive'])) +``` + +``` + precision recall 
f1-score support + + Negative 0.85 0.86 0.85 12500 + Positive 0.85 0.85 0.85 12500 + + accuracy 0.85 25000 + macro avg 0.85 0.85 0.85 25000 +weighted avg 0.85 0.85 0.85 25000 +``` + +## Построили ROC-кривую по результату обработки тестовой +выборки и вычислили площадь под ROC-кривой (AUC ROC) + +```python +from sklearn.metrics import roc_curve, auc +import matplotlib.pyplot as plt +fpr, tpr, thresholds = roc_curve(y_test, y_score) +plt.plot(fpr, tpr) +plt.grid() +plt.xlabel('False Positive Rate') +plt.ylabel('True Positive Rate') +plt.title('ROC') +plt.show() +print('Area under ROC is', auc(fpr, tpr)) +``` + + +![roc](roc.png) + +``` +Area under ROC is 0.925042592 +``` +## 11) Выводы по результатам применения рекуррентной нейронной сети. + +``` +В данной лабораторной работе была реализована и обучена рекуррентная нейронная сеть с использованием слоя LSTM для задачи классификации тональности отзывов на основе данных IMDb. После этапа предобработки, состоявшего из приведения длины текстов к одному размеру и преобразования слов в числовые индексы, модель смогла качественно решать поставленную задачу. + +Результаты эксперимента показали, что точность на тестовой выборке составляет примерно 85%, то есть примерно 85% отзывов классифицируются верно. Классификационный отчёт с метриками precision, recall и f1-score продемонстрировал, что модель одинаково хорошо справляется как с положительными, так и с отрицательными отзывами. + +Анализ ROC-кривой и значения AUC ROC (примерно 0.93) подтвердил высокую эффективность модели в различении двух классов отзывов. + +В итоге применение LSTM-рекуррентной нейронной сети оказалось удачным выбором для анализа тональности текста. Модель хорошо выявляет смысловые зависимости в последовательностях и достигает высокого качества классификации, что делает её пригодной для таких практических задач, как фильтрация, анализ эмоциональной окраски и обработка пользовательских отзывов. 
+``` \ No newline at end of file diff --git a/labworks/LW4/roc.png b/labworks/LW4/roc.png new file mode 100644 index 0000000..6382126 Binary files /dev/null and b/labworks/LW4/roc.png differ