{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c3a9e1b0",
   "metadata": {},
   "source": [
    "# Pre-trained word embeddings with gensim\n",
    "\n",
    "This notebook loads two pre-trained embedding models through `gensim.downloader` (`word2vec-ruscorpora-300` and `glove-twitter-25`) and inspects nearest neighbours and pairwise similarity.\n",
    "\n",
    "- Stored outputs were cleared on purpose: the old `pip` log exposed a local user path and one cell dumped a full 300-value vector. Use **Restart Kernel → Run All** to regenerate the results.\n",
    "- `gensim.downloader.load` fetches a model over the network when it is not already available locally, so the first run needs internet access."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "849730f4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (not !pip) installs into the environment of the running kernel.\n",
    "# The version is pinned to the release this notebook was last run with.\n",
    "%pip install -q gensim==4.3.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7fd6636b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gensim.downloader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "527c1b9a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Names of every model the downloader knows about.\n",
    "list(gensim.downloader.info()['models'])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d1ef605d",
   "metadata": {},
   "source": [
    "# Word2Vec\n",
    "\n",
    "Keys in `word2vec-ruscorpora-300` carry a part-of-speech suffix (for example `кот_NOUN`), so every query has to include the tag.\n",
    "\n",
    "In the previously saved run the nearest neighbour of `кот_NOUN` was `кошка_NOUN` (similarity ≈ 0.757)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f539977",
   "metadata": {},
   "outputs": [],
   "source": [
    "w2v_vectors = gensim.downloader.load('word2vec-ruscorpora-300')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca0e6543",
   "metadata": {},
   "outputs": [],
   "source": [
    "# First ten vocabulary entries; index_to_key is already a list in index order,\n",
    "# so there is no need to materialise every key of key_to_index first.\n",
    "w2v_vectors.index_to_key[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7840d1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "w2v_vectors.most_similar('кот_NOUN')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de05204a",
   "metadata": {},
   "outputs": [],
   "source": [
    "cat_vector = w2v_vectors['кот_NOUN']\n",
    "# Show the shape and a short slice instead of dumping every component.\n",
    "cat_vector.shape, cat_vector[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3f93b5f6",
   "metadata": {},
   "source": [
    "# GloVe\n",
    "\n",
    "`glove-twitter-25` stores 25-dimensional vectors.\n",
    "\n",
    "In the previously saved run the nearest neighbour of `cat` was `dog` (similarity ≈ 0.959), and `similarity('cat', 'bus')` was ≈ 0.609."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "be870586",
   "metadata": {},
   "outputs": [],
   "source": [
    "glove_model = gensim.downloader.load('glove-twitter-25')  # load GloVe vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "599d6406",
   "metadata": {},
   "outputs": [],
   "source": [
    "glove_model['cat']  # word embedding for 'cat'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e4d2a7f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "glove_model.most_similar('cat')  # words that are most similar to 'cat'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2db71cfb",
   "metadata": {},
   "outputs": [],
   "source": [
    "glove_model.similarity('cat', 'bus')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}