diff --git a/README.md b/README.md index 6cf562b..1004668 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,8 @@ | 08.02.2023 | [Вводная лекция](lections/OATD_lec_1.pdf) [pptx](lections/OATD_lec_1.pptx) | | 13.02.2023 | [Клаcсификация данных. Основные понятия](lections/OATD_lec_2.pdf) [pptx](lections/OATD_lec_2.pptx) | | 20.02.2023 | [Интеллектуальный анализ текстов](lections/OATD_lec_3.pdf) [pptx](lections/OATD_lec_3.pptx) | +| 27.02.2023 | [Векторное представление слов](lections/OATD_lec_4.pdf) [pptx](lections/OATD_lec_4.pptx) | +| 06.03.2023 | [Обзор методов классификации](lections/OATD_lec_5.pdf) [pptx](lections/OATD_lec_5.pptx) | ## [Журнал группы](https://docs.google.com/spreadsheets/d/1Ylxaq-dMsEjq2ValfzxFbz9bQC7gCFsfHVfGE9VSx3Y/edit#gid=0) diff --git a/lections/OATD_lec_3.pptx b/lections/OATD_lec_3.pptx index 68b1948..147b3dd 100644 Binary files a/lections/OATD_lec_3.pptx and b/lections/OATD_lec_3.pptx differ diff --git a/lections/OATD_lec_4.pdf b/lections/OATD_lec_4.pdf new file mode 100644 index 0000000..64c5dbb Binary files /dev/null and b/lections/OATD_lec_4.pdf differ diff --git a/lections/OATD_lec_4.pptx b/lections/OATD_lec_4.pptx new file mode 100644 index 0000000..4d15a5a Binary files /dev/null and b/lections/OATD_lec_4.pptx differ diff --git a/lections/OATD_lec_5.pdf b/lections/OATD_lec_5.pdf new file mode 100644 index 0000000..91a50a2 Binary files /dev/null and b/lections/OATD_lec_5.pdf differ diff --git a/lections/OATD_lec_5.pptx b/lections/OATD_lec_5.pptx new file mode 100644 index 0000000..dad59ae Binary files /dev/null and b/lections/OATD_lec_5.pptx differ diff --git a/lections/notebooks/lec4_gensim.ipynb b/lections/notebooks/lec4_gensim.ipynb new file mode 100644 index 0000000..c05ff65 --- /dev/null +++ b/lections/notebooks/lec4_gensim.ipynb @@ -0,0 +1,353 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "849730f4", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: gensim in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (4.3.0)\n", + "Requirement already satisfied: numpy>=1.18.5 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from gensim) (1.23.3)\n", + "Requirement already satisfied: FuzzyTM>=0.4.0 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from gensim) (2.0.5)\n", + "Requirement already satisfied: Cython==0.29.32 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from gensim) (0.29.32)\n", + "Requirement already satisfied: smart-open>=1.8.1 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from gensim) (6.3.0)\n", + "Requirement already satisfied: scipy>=1.7.0 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from gensim) (1.10.0)\n", + "Requirement already satisfied: pyfume in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from FuzzyTM>=0.4.0->gensim) (0.2.25)\n", + "Requirement already satisfied: pandas in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from FuzzyTM>=0.4.0->gensim) (1.5.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.1 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from pandas->FuzzyTM>=0.4.0->gensim) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from pandas->FuzzyTM>=0.4.0->gensim) (2022.7.1)\n", + "Requirement already satisfied: simpful in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from pyfume->FuzzyTM>=0.4.0->gensim) (2.10.0)\n", + "Requirement already satisfied: fst-pso in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from pyfume->FuzzyTM>=0.4.0->gensim) (1.8.1)\n", + "Requirement already satisfied: six>=1.5 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from python-dateutil>=2.8.1->pandas->FuzzyTM>=0.4.0->gensim) (1.16.0)\n", + "Requirement already satisfied: miniful in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from fst-pso->pyfume->FuzzyTM>=0.4.0->gensim) (0.0.6)\n", + "Requirement already satisfied: requests in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from simpful->pyfume->FuzzyTM>=0.4.0->gensim) (2.28.1)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from requests->simpful->pyfume->FuzzyTM>=0.4.0->gensim) (1.26.12)\n", + "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from requests->simpful->pyfume->FuzzyTM>=0.4.0->gensim) (3.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from requests->simpful->pyfume->FuzzyTM>=0.4.0->gensim) (2022.9.14)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in c:\\users\\андрей\\appdata\\local\\programs\\python\\python39\\lib\\site-packages (from requests->simpful->pyfume->FuzzyTM>=0.4.0->gensim) (2.1.1)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "[notice] A new release of pip available: 22.2.2 -> 23.0.1\n", + "[notice] To update, run: python.exe -m pip install --upgrade pip\n" + ] + } + ], + "source": [ + "!pip install gensim\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7fd6636b", + "metadata": {}, + "outputs": [], + "source": [ + "import gensim.downloader" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "527c1b9a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']\n" + ] + } + ], + "source": [ + "print(list(gensim.downloader.info()['models'].keys()))" + ] + }, + { + "cell_type": "markdown", + "id": "d1ef605d", + "metadata": {}, + "source": [ + "\n", + "# Word2Vec" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6f539977", + "metadata": {}, + "outputs": [], + "source": [ + "w2v_vectors = gensim.downloader.load('word2vec-ruscorpora-300')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "ca0e6543", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['весь_DET',\n", + " 'человек_NOUN',\n", + " 'мочь_VERB',\n", + " 'год_NOUN',\n", + " 'сказать_VERB',\n", + " 'время_NOUN',\n", + " 'говорить_VERB',\n", + " 'становиться_VERB',\n", + " 'знать_VERB',\n", + " 'самый_DET']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(w2v_vectors.key_to_index.keys())[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b7840d1e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('кошка_NOUN', 0.7570087909698486),\n", + " ('котенок_NOUN', 0.6676193475723267),\n", + " ('пес_NOUN', 0.5633267164230347),\n", + " ('мяукать_VERB', 0.561974287033081),\n", + " ('тобик_NOUN', 0.5586473941802979),\n", + " ('фоксик_NOUN', 0.5572988986968994),\n", + " ('собака_NOUN', 0.5567899942398071),\n", + " ('мяучать_VERB', 0.5535756349563599),\n", + " ('харлашка_NOUN', 0.551755428314209),\n", + " ('котяра_NOUN', 0.5508568286895752)]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w2v_vectors.most_similar('кот_NOUN')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "de05204a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([-1.52633622e-01, -6.22178875e-02, 8.02985206e-02, 3.76203880e-02,\n", + " -8.09977110e-03, -6.56392053e-02, 5.08799739e-02, 6.61313012e-02,\n", + " 1.57197528e-02, 9.78986733e-03, 4.73552682e-02, -2.55929027e-02,\n", + " 1.05717339e-01, -2.22761724e-02, 5.41505031e-02, -3.82993110e-02,\n", + " 5.26556484e-02, 1.17264939e-02, -1.46230776e-02, -1.95544884e-02,\n", + " 5.04205190e-02, -3.71097960e-02, 3.57442684e-02, 4.96631972e-02,\n", + " -5.57994805e-02, -3.11674438e-02, -2.20739599e-02, 1.07113965e-01,\n", + " -9.91705209e-02, -4.57583293e-02, -9.96095166e-02, 4.80931476e-02,\n", + " -1.33494905e-03, 3.51430699e-02, 2.42795311e-02, -2.34595835e-02,\n", + " 5.17160492e-03, -2.06816625e-02, 4.38127927e-02, -3.31711844e-02,\n", + " -2.07874626e-02, 6.72167316e-02, -7.74500072e-02, 2.93545369e-02,\n", + " -1.46178985e-02, 4.10723649e-02, 8.69638026e-02, -3.46537703e-03,\n", + " 3.90354246e-02, -2.03978154e-03, 5.43198660e-02, 7.57279024e-02,\n", + " 1.48434611e-02, 8.33871886e-02, -2.87217349e-02, -3.09202913e-03,\n", + " -7.93954656e-02, -2.82405037e-02, -1.64566293e-01, -1.17127458e-02,\n", + " -2.68191863e-02, -1.14840917e-01, 4.07641158e-02, -1.52551448e-02,\n", + " 1.05389841e-01, -2.80199181e-02, -1.25609236e-02, 1.09363765e-01,\n", + " -1.05669824e-02, 1.92236323e-02, -2.05025654e-02, 3.25121842e-02,\n", + " 3.57208811e-02, -2.52568591e-02, 2.24481337e-02, 5.09182140e-02,\n", + " 6.63011149e-02, -6.96184263e-02, -5.87991485e-03, 3.19263488e-02,\n", + " 2.67947633e-02, 5.35315834e-02, 5.44695035e-02, 2.58983169e-02,\n", + " -7.08631724e-02, 1.04762614e-01, -6.68804273e-02, -1.38250962e-02,\n", + " 1.44148827e-01, 6.52979612e-02, 1.60416458e-02, -2.04468183e-02,\n", + " 3.70856933e-02, -3.04988828e-02, 1.09351687e-01, 1.64980050e-02,\n", + " 2.36458685e-02, -1.01091415e-02, -6.50116727e-02, -1.13031827e-01,\n", + " -1.19736008e-01, -5.59152151e-03, 1.64195765e-02, 8.24512169e-03,\n", + " -8.84061214e-03, 7.30062574e-02, 2.95458623e-04, 3.91627736e-02,\n", + " 6.22012243e-02, 1.01540620e-02, -2.01074360e-03, 9.14960168e-03,\n", + " -2.40149889e-02, -7.16753602e-02, -8.49208906e-02, 5.45662642e-02,\n", + " 2.19109673e-02, 9.25432891e-03, 2.24880818e-02, -3.62291490e-03,\n", + " 8.57939944e-02, -5.56841269e-02, -1.16740711e-01, 2.33066957e-02,\n", + " -8.18690881e-02, -1.44955916e-02, 3.33725065e-02, 3.03953364e-02,\n", + " 2.25391071e-02, -3.46978344e-02, -6.41057938e-02, 7.33885840e-02,\n", + " -2.90144072e-03, -2.75960714e-02, -2.21674796e-02, -3.96765396e-02,\n", + " -3.22195105e-02, 4.82296161e-02, 4.16103862e-02, 3.63796987e-02,\n", + " 2.58319732e-02, 7.23602101e-02, 1.09503092e-03, 8.37009493e-03,\n", + " 5.09082936e-02, -3.29718776e-02, -5.68303093e-02, 1.01079745e-02,\n", + " -8.52582380e-02, 1.99150909e-02, 2.33987775e-02, -3.49289179e-02,\n", + " -2.18948033e-02, -1.17089637e-02, 1.78485103e-02, -5.88125037e-03,\n", + " 2.24573947e-02, -7.76379481e-02, -2.46963687e-02, 2.34957393e-02,\n", + " -7.47927353e-02, -3.52633633e-02, 6.65142164e-02, -2.21630055e-02,\n", + " 9.85186771e-02, -4.27325964e-02, 2.38673016e-02, 3.69326621e-02,\n", + " 5.19271940e-03, -4.75301892e-02, -1.99485421e-02, 2.70965626e-03,\n", + " -7.23582553e-03, 8.48396868e-02, 6.64435774e-02, -9.35326666e-02,\n", + " 4.94468771e-02, 8.26572999e-02, -1.33822160e-02, -5.32249734e-03,\n", + " 4.29970361e-02, 8.93590376e-02, -1.27462680e-02, 2.74799261e-02,\n", + " -3.33027355e-02, 4.35785688e-02, 4.56295535e-02, 3.17847766e-02,\n", + " -9.68080908e-02, -6.77153543e-02, -9.52497870e-02, -8.87092575e-03,\n", + " -4.08960059e-02, -5.09431772e-02, 2.54585221e-02, 5.80319017e-02,\n", + " 5.08921407e-02, -5.23761436e-02, -2.77449843e-02, 7.23702163e-02,\n", + " -9.36738960e-03, 8.10077041e-03, 3.52279693e-02, -1.19305283e-01,\n", + " -3.82529870e-02, -8.29238147e-02, -8.81364495e-02, 1.62167493e-02,\n", + " 2.68793292e-02, -3.83929200e-02, -2.57957950e-02, -1.86822563e-02,\n", + " -5.47099225e-02, -5.65230772e-02, -1.98926777e-02, 3.54687981e-02,\n", + " 1.35690883e-01, 8.04331973e-02, 1.92622133e-02, 5.81734739e-02,\n", + " -5.02377190e-02, 2.47635460e-03, -5.33336513e-02, 4.08107415e-02,\n", + " 1.18754342e-01, -7.40583912e-02, 7.48252273e-02, 1.46314219e-01,\n", + " 6.73391623e-03, -1.98812839e-02, -2.93681423e-05, -2.12224070e-02,\n", + " 1.70804688e-03, 3.52822542e-02, -1.65668026e-01, -4.84176865e-03,\n", + " 1.21439025e-02, 8.64505395e-02, -1.57235548e-01, 7.75721148e-02,\n", + " 5.35202436e-02, 1.17224073e-02, -7.53299072e-02, -3.44986990e-02,\n", + " -1.58868451e-02, 7.00481758e-02, 7.96044394e-02, -4.09048088e-02,\n", + " -1.46982130e-02, -1.24979429e-01, -4.20956686e-02, -8.43289569e-02,\n", + " -6.92764968e-02, 5.16316369e-02, 2.03369856e-02, -4.73499410e-02,\n", + " 9.15571675e-02, -5.96052743e-02, 1.10012911e-01, 2.55208667e-02,\n", + " -8.69148783e-03, -7.76273850e-03, 4.98862900e-02, 9.31067672e-03,\n", + " -3.49833667e-02, 1.33375779e-01, 8.40289332e-03, -3.45170535e-02,\n", + " -3.47062238e-02, -9.73994732e-02, -2.54784450e-02, -1.39390659e-02,\n", + " -3.32783237e-02, 9.36794057e-02, 3.47191617e-02, 2.80651636e-02,\n", + " 6.58571906e-03, 3.73428725e-02, -3.32412347e-02, -9.73492190e-02,\n", + " -7.07265735e-02, -7.01062232e-02, 3.67225669e-02, -2.62719765e-02,\n", + " 5.82991205e-02, -7.42069781e-02, 1.66096780e-02, -8.83689746e-02,\n", + " -1.62591994e-01, 4.79482487e-02, 5.83929494e-02, -1.04699671e-01,\n", + " 3.52650951e-03, 2.50546616e-02, 3.84298228e-02, -4.36684191e-02,\n", + " 5.68282753e-02, 6.57160487e-03, -3.02405991e-02, 2.51490474e-02],\n", + " dtype=float32)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w2v_vectors['кот_NOUN']" + ] + }, + { + "cell_type": "markdown", + "id": "3f93b5f6", + "metadata": {}, + "source": [ + "# GloVe" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "be870586", + "metadata": {}, + "outputs": [], + "source": [ + "glove_model = gensim.downloader.load(\"glove-twitter-25\") # load glove vectors\n" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "599d6406", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[-0.96419 -0.60978 0.67449 0.35113 0.41317 -0.21241 1.3796\n", + " 0.12854 0.31567 0.66325 0.3391 -0.18934 -3.325 -1.1491\n", + " -0.4129 0.2195 0.8706 -0.50616 -0.12781 -0.066965 0.065761\n", + " 0.43927 0.1758 -0.56058 0.13529 ]\n" + ] + }, + { + "data": { + "text/plain": [ + "[('dog', 0.9590820074081421),\n", + " ('monkey', 0.920357882976532),\n", + " ('bear', 0.9143136739730835),\n", + " ('pet', 0.9108031392097473),\n", + " ('girl', 0.8880629539489746),\n", + " ('horse', 0.8872726559638977),\n", + " ('kitty', 0.8870542049407959),\n", + " ('puppy', 0.886769711971283),\n", + " ('hot', 0.886525571346283),\n", + " ('lady', 0.8845519423484802)]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(glove_model['cat']) # word embedding for 'cat'\n", + "glove_model.most_similar(\"cat\") # show words that similar to word 'cat'" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "2db71cfb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.60927683" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "glove_model.similarity('cat', 'bus')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}