diff --git a/lections/OATD_lec_4.pdf b/lections/OATD_lec_4.pdf index 64c5dbb..2b2ac94 100644 Binary files a/lections/OATD_lec_4.pdf and b/lections/OATD_lec_4.pdf differ diff --git a/lections/OATD_lec_4.pptx b/lections/OATD_lec_4.pptx index 4d15a5a..db90ba6 100644 Binary files a/lections/OATD_lec_4.pptx and b/lections/OATD_lec_4.pptx differ diff --git a/lections/notebooks/lec4_gensim.ipynb b/lections/notebooks/lec4_gensim.ipynb index c05ff65..f3a03a1 100644 --- a/lections/notebooks/lec4_gensim.ipynb +++ b/lections/notebooks/lec4_gensim.ipynb @@ -10,18 +10,34 @@ "name": "stdout", "output_type": "stream", "text": [ - "Requirement already satisfied: gensim in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (4.3.0)\n", + "Requirement already satisfied: gensim in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (4.3.0)" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "[notice] A new release of pip is available: 23.0.1 -> 24.0\n", + "[notice] To update, run: python.exe -m pip install --upgrade pip\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Requirement already satisfied: scipy>=1.7.0 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from gensim) (1.10.0)\n", "Requirement already satisfied: numpy>=1.18.5 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from gensim) (1.23.3)\n", - "Requirement already satisfied: FuzzyTM>=0.4.0 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from gensim) (2.0.5)\n", - "Requirement already satisfied: Cython==0.29.32 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from gensim) (0.29.32)\n", "Requirement already satisfied: smart-open>=1.8.1 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from gensim) (6.3.0)\n", - "Requirement already satisfied: scipy>=1.7.0 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from gensim) (1.10.0)\n", + "Requirement already satisfied: Cython==0.29.32 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from gensim) (0.29.32)\n", + "Requirement already satisfied: FuzzyTM>=0.4.0 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from gensim) (2.0.5)\n", "Requirement already satisfied: pyfume in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from FuzzyTM>=0.4.0->gensim) (0.2.25)\n", "Requirement already satisfied: pandas in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from FuzzyTM>=0.4.0->gensim) (1.5.3)\n", "Requirement already satisfied: python-dateutil>=2.8.1 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from pandas->FuzzyTM>=0.4.0->gensim) (2.8.2)\n", "Requirement already satisfied: pytz>=2020.1 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from pandas->FuzzyTM>=0.4.0->gensim) (2022.7.1)\n", - "Requirement already satisfied: simpful in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from pyfume->FuzzyTM>=0.4.0->gensim) (2.10.0)\n", "Requirement already satisfied: fst-pso in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from pyfume->FuzzyTM>=0.4.0->gensim) (1.8.1)\n", + "Requirement already satisfied: simpful in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from pyfume->FuzzyTM>=0.4.0->gensim) (2.10.0)\n", "Requirement already satisfied: six>=1.5 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from python-dateutil>=2.8.1->pandas->FuzzyTM>=0.4.0->gensim) (1.16.0)\n", "Requirement already satisfied: miniful in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from fst-pso->pyfume->FuzzyTM>=0.4.0->gensim) (0.0.6)\n", "Requirement already satisfied: requests in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from simpful->pyfume->FuzzyTM>=0.4.0->gensim) (2.28.1)\n", @@ -30,15 +46,6 @@ "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from requests->simpful->pyfume->FuzzyTM>=0.4.0->gensim) (2022.9.14)\n", "Requirement already satisfied: charset-normalizer<3,>=2 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from requests->simpful->pyfume->FuzzyTM>=0.4.0->gensim) (2.1.1)\n" ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "[notice] A new release of pip available: 22.2.2 -> 23.0.1\n", - "[notice] To update, run: python.exe -m pip install --upgrade pip\n" - ] } ], "source": [ @@ -57,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "id": "527c1b9a", "metadata": {}, "outputs": [ @@ -84,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "id": "6f539977", "metadata": {}, "outputs": [], @@ -94,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "id": "ca0e6543", "metadata": {}, "outputs": [ @@ -113,7 +120,7 @@ " 'самый_DET']" ] }, - "execution_count": 8, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -124,37 +131,7 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "b7840d1e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[('кошка_NOUN', 0.7570087909698486),\n", - " ('котенок_NOUN', 0.6676193475723267),\n", - " ('пес_NOUN', 0.5633267164230347),\n", - " ('мяукать_VERB', 0.561974287033081),\n", - " ('тобик_NOUN', 0.5586473941802979),\n", - " ('фоксик_NOUN', 0.5572988986968994),\n", - " ('собака_NOUN', 0.5567899942398071),\n", - " ('мяучать_VERB', 0.5535756349563599),\n", - " ('харлашка_NOUN', 0.551755428314209),\n", - " ('котяра_NOUN', 0.5508568286895752)]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "w2v_vectors.most_similar('кот_NOUN')" - ] - }, - { - "cell_type": "code", - "execution_count": 13, + "execution_count": 6, "id": "de05204a", "metadata": {}, "outputs": [ @@ -239,7 +216,7 @@ " dtype=float32)" ] }, - "execution_count": 13, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -248,6 +225,36 @@ "w2v_vectors['кот_NOUN']" ] }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b7840d1e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('кошка_NOUN', 0.7570087909698486),\n", + " ('котенок_NOUN', 0.6676193475723267),\n", + " ('пес_NOUN', 0.5633267164230347),\n", + " ('мяукать_VERB', 0.561974287033081),\n", + " ('тобик_NOUN', 0.5586473941802979),\n", + " ('фоксик_NOUN', 0.5572988986968994),\n", + " ('собака_NOUN', 0.5567899942398071),\n", + " ('мяучать_VERB', 0.5535756349563599),\n", + " ('харлашка_NOUN', 0.551755428314209),\n", + " ('котяра_NOUN', 0.5508568286895752)]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w2v_vectors.most_similar('кот_NOUN')" + ] + }, { "cell_type": "markdown", "id": "3f93b5f6", @@ -258,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 8, "id": "be870586", "metadata": {}, "outputs": [], @@ -268,7 +275,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 9, "id": "599d6406", "metadata": {}, "outputs": [ @@ -297,7 +304,7 @@ " ('lady', 0.8845519423484802)]" ] }, - "execution_count": 19, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -309,7 +316,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 10, "id": "2db71cfb", "metadata": {}, "outputs": [ @@ -319,7 +326,7 @@ "0.60927683" ] }, - "execution_count": 24, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } diff --git a/lections/notebooks/lec5_text2vec_classifier.ipynb b/lections/notebooks/lec5_text2vec_classifier.ipynb index d202ef1..4658805 100644 --- a/lections/notebooks/lec5_text2vec_classifier.ipynb +++ b/lections/notebooks/lec5_text2vec_classifier.ipynb @@ -594,27 +594,14 @@ " -0.34222007 1.78697008 -1.45404002 2.56643 -1.32184002\n", " -1.04677537 0.27867999 -12.95450976 -1.00809997 3.15975004\n", " -0.52662008 1.93239999 -0.89686999 -0.60924001 1.51628\n", - " -3.16624993 -0.89275002 1.86969995 -1.33607102 -2.23464306]\n", - "glove_data: 0 1 2 3 4 5 6 7 \\\n", - "0 -1.55058 -0.081683 0.279919 0.588469 -1.00551 -0.826139 6.18643 1.44845 \n", - "0 1.73610 0.742082 0.355460 -4.744110 1.41544 -0.342220 1.78697 -1.45404 \n", - "\n", - " 8 9 ... 15 16 17 18 19 \\\n", - "0 -0.71108 -1.14717 ... -0.430875 0.872347 -0.806399 0.27203 2.23922 \n", - "0 2.56643 -1.32184 ... -0.526620 1.932400 -0.896870 -0.60924 1.51628 \n", - "\n", - " 20 21 22 23 24 \n", - "0 -1.23572 -1.310711 -1.96934 -0.176410 -0.135300 \n", - "0 -3.16625 -0.892750 1.86970 -1.336071 -2.234643 \n", - "\n", - "[2 rows x 25 columns]\n" + " -3.16624993 -0.89275002 1.86969995 -1.33607102 -2.23464306]\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "C:\\Users\\Андрей\\AppData\\Local\\Temp\\ipykernel_29476\\129113310.py:17: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n", + "C:\\Users\\Андрей\\AppData\\Local\\Temp\\ipykernel_8524\\2010506005.py:17: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n", " glove_data=glove_data.append(pd.DataFrame([one_doc]))\n" ] } @@ -623,7 +610,7 @@ "# Создадим датафрейм, в который будем сохранять вектор документа\n", "glove_data=pd.DataFrame()\n", "\n", - "# Пробегаем по каждой строке (по каждому документу)\n", + "# Пробегаем по каждой строке датафрейма (по каждому документу)\n", "for i in range(CV_text_data.shape[0]):\n", " \n", " # Вектор одного документа с размерностью glove-модели:\n", @@ -636,8 +623,7 @@ " print(word, ': ', glove_model[word])\n", " one_doc += glove_model[word]\n", " print(text_data[i], ': ', one_doc)\n", - " glove_data=glove_data.append(pd.DataFrame([one_doc])) \n", - "print('glove_data: ', glove_data)" + " glove_data=glove_data.append(pd.DataFrame([one_doc])) \n" ] }, { @@ -804,33 +790,6 @@ "glove_data\n" ] }, - { - "cell_type": "code", - "execution_count": 14, - "id": "cb6edbdf", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([ 1.73610002, 0.74208201, 0.35545996, -4.74411008,\n", - " 1.41543998, -0.34222007, 1.78697008, -1.45404002,\n", - " 2.56643 , -1.32184002, -1.04677537, 0.27867999,\n", - " -12.95450976, -1.00809997, 3.15975004, -0.52662008,\n", - " 1.93239999, -0.89686999, -0.60924001, 1.51628 ,\n", - " -3.16624993, -0.89275002, 1.86969995, -1.33607102,\n", - " -2.23464306])" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "one_doc" - ] - }, { "cell_type": "code", "execution_count": 15, @@ -1217,28 +1176,6 @@ "train_data_glove" ] }, - { - "cell_type": "code", - "execution_count": 16, - "id": "3a7ea7c6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "<1657x23297 sparse matrix of type ''\n", - "\twith 106580 stored elements in Compressed Sparse Row format>" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "train_data\n" - ] - }, { "cell_type": "code", "execution_count": 17,