Вы не можете выбрать более 25 тем. Темы должны начинаться с буквы или цифры, могут содержать дефисы (-) и должны содержать не более 35 символов.

16 KiB

!pip install gensim
Requirement already satisfied: gensim in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (4.3.0)

[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip

Requirement already satisfied: scipy>=1.7.0 in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (from gensim) (1.10.0)
Requirement already satisfied: numpy>=1.18.5 in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (from gensim) (1.23.3)
Requirement already satisfied: smart-open>=1.8.1 in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (from gensim) (6.3.0)
Requirement already satisfied: Cython==0.29.32 in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (from gensim) (0.29.32)
Requirement already satisfied: FuzzyTM>=0.4.0 in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (from gensim) (2.0.5)
Requirement already satisfied: pyfume in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (from FuzzyTM>=0.4.0->gensim) (0.2.25)
Requirement already satisfied: pandas in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (from FuzzyTM>=0.4.0->gensim) (1.5.3)
Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (from pandas->FuzzyTM>=0.4.0->gensim) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (from pandas->FuzzyTM>=0.4.0->gensim) (2022.7.1)
Requirement already satisfied: fst-pso in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (from pyfume->FuzzyTM>=0.4.0->gensim) (1.8.1)
Requirement already satisfied: simpful in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (from pyfume->FuzzyTM>=0.4.0->gensim) (2.10.0)
Requirement already satisfied: six>=1.5 in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (from python-dateutil>=2.8.1->pandas->FuzzyTM>=0.4.0->gensim) (1.16.0)
Requirement already satisfied: miniful in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (from fst-pso->pyfume->FuzzyTM>=0.4.0->gensim) (0.0.6)
Requirement already satisfied: requests in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (from simpful->pyfume->FuzzyTM>=0.4.0->gensim) (2.28.1)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (from requests->simpful->pyfume->FuzzyTM>=0.4.0->gensim) (1.26.12)
Requirement already satisfied: idna<4,>=2.5 in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (from requests->simpful->pyfume->FuzzyTM>=0.4.0->gensim) (3.4)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (from requests->simpful->pyfume->FuzzyTM>=0.4.0->gensim) (2022.9.14)
Requirement already satisfied: charset-normalizer<3,>=2 in c:\users\андрей\appdata\local\programs\python\python39\lib\site-packages (from requests->simpful->pyfume->FuzzyTM>=0.4.0->gensim) (2.1.1)
import gensim.downloader
# Print the names of the pretrained models the gensim downloader knows about.
# Iterating the 'models' mapping directly yields its keys.
print(list(gensim.downloader.info()['models']))
['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']

Word2Vec

# Fetch the 'word2vec-ruscorpora-300' vectors through the gensim downloader.
w2v_vectors = gensim.downloader.load('word2vec-ruscorpora-300')
# Peek at the first ten vocabulary keys. index_to_key is already an ordered
# list, so slicing it avoids copying the entire vocabulary into a new list
# just to display ten entries.
w2v_vectors.index_to_key[:10]
['весь_DET',
 'человек_NOUN',
 'мочь_VERB',
 'год_NOUN',
 'сказать_VERB',
 'время_NOUN',
 'говорить_VERB',
 'становиться_VERB',
 'знать_VERB',
 'самый_DET']
# Look up the embedding vector stored under the key 'кот_NOUN'
# (keys in this model are lemmas with a part-of-speech suffix).
w2v_vectors['кот_NOUN']
array([-1.52633622e-01, -6.22178875e-02,  8.02985206e-02,  3.76203880e-02,
       -8.09977110e-03, -6.56392053e-02,  5.08799739e-02,  6.61313012e-02,
        1.57197528e-02,  9.78986733e-03,  4.73552682e-02, -2.55929027e-02,
        1.05717339e-01, -2.22761724e-02,  5.41505031e-02, -3.82993110e-02,
        5.26556484e-02,  1.17264939e-02, -1.46230776e-02, -1.95544884e-02,
        5.04205190e-02, -3.71097960e-02,  3.57442684e-02,  4.96631972e-02,
       -5.57994805e-02, -3.11674438e-02, -2.20739599e-02,  1.07113965e-01,
       -9.91705209e-02, -4.57583293e-02, -9.96095166e-02,  4.80931476e-02,
       -1.33494905e-03,  3.51430699e-02,  2.42795311e-02, -2.34595835e-02,
        5.17160492e-03, -2.06816625e-02,  4.38127927e-02, -3.31711844e-02,
       -2.07874626e-02,  6.72167316e-02, -7.74500072e-02,  2.93545369e-02,
       -1.46178985e-02,  4.10723649e-02,  8.69638026e-02, -3.46537703e-03,
        3.90354246e-02, -2.03978154e-03,  5.43198660e-02,  7.57279024e-02,
        1.48434611e-02,  8.33871886e-02, -2.87217349e-02, -3.09202913e-03,
       -7.93954656e-02, -2.82405037e-02, -1.64566293e-01, -1.17127458e-02,
       -2.68191863e-02, -1.14840917e-01,  4.07641158e-02, -1.52551448e-02,
        1.05389841e-01, -2.80199181e-02, -1.25609236e-02,  1.09363765e-01,
       -1.05669824e-02,  1.92236323e-02, -2.05025654e-02,  3.25121842e-02,
        3.57208811e-02, -2.52568591e-02,  2.24481337e-02,  5.09182140e-02,
        6.63011149e-02, -6.96184263e-02, -5.87991485e-03,  3.19263488e-02,
        2.67947633e-02,  5.35315834e-02,  5.44695035e-02,  2.58983169e-02,
       -7.08631724e-02,  1.04762614e-01, -6.68804273e-02, -1.38250962e-02,
        1.44148827e-01,  6.52979612e-02,  1.60416458e-02, -2.04468183e-02,
        3.70856933e-02, -3.04988828e-02,  1.09351687e-01,  1.64980050e-02,
        2.36458685e-02, -1.01091415e-02, -6.50116727e-02, -1.13031827e-01,
       -1.19736008e-01, -5.59152151e-03,  1.64195765e-02,  8.24512169e-03,
       -8.84061214e-03,  7.30062574e-02,  2.95458623e-04,  3.91627736e-02,
        6.22012243e-02,  1.01540620e-02, -2.01074360e-03,  9.14960168e-03,
       -2.40149889e-02, -7.16753602e-02, -8.49208906e-02,  5.45662642e-02,
        2.19109673e-02,  9.25432891e-03,  2.24880818e-02, -3.62291490e-03,
        8.57939944e-02, -5.56841269e-02, -1.16740711e-01,  2.33066957e-02,
       -8.18690881e-02, -1.44955916e-02,  3.33725065e-02,  3.03953364e-02,
        2.25391071e-02, -3.46978344e-02, -6.41057938e-02,  7.33885840e-02,
       -2.90144072e-03, -2.75960714e-02, -2.21674796e-02, -3.96765396e-02,
       -3.22195105e-02,  4.82296161e-02,  4.16103862e-02,  3.63796987e-02,
        2.58319732e-02,  7.23602101e-02,  1.09503092e-03,  8.37009493e-03,
        5.09082936e-02, -3.29718776e-02, -5.68303093e-02,  1.01079745e-02,
       -8.52582380e-02,  1.99150909e-02,  2.33987775e-02, -3.49289179e-02,
       -2.18948033e-02, -1.17089637e-02,  1.78485103e-02, -5.88125037e-03,
        2.24573947e-02, -7.76379481e-02, -2.46963687e-02,  2.34957393e-02,
       -7.47927353e-02, -3.52633633e-02,  6.65142164e-02, -2.21630055e-02,
        9.85186771e-02, -4.27325964e-02,  2.38673016e-02,  3.69326621e-02,
        5.19271940e-03, -4.75301892e-02, -1.99485421e-02,  2.70965626e-03,
       -7.23582553e-03,  8.48396868e-02,  6.64435774e-02, -9.35326666e-02,
        4.94468771e-02,  8.26572999e-02, -1.33822160e-02, -5.32249734e-03,
        4.29970361e-02,  8.93590376e-02, -1.27462680e-02,  2.74799261e-02,
       -3.33027355e-02,  4.35785688e-02,  4.56295535e-02,  3.17847766e-02,
       -9.68080908e-02, -6.77153543e-02, -9.52497870e-02, -8.87092575e-03,
       -4.08960059e-02, -5.09431772e-02,  2.54585221e-02,  5.80319017e-02,
        5.08921407e-02, -5.23761436e-02, -2.77449843e-02,  7.23702163e-02,
       -9.36738960e-03,  8.10077041e-03,  3.52279693e-02, -1.19305283e-01,
       -3.82529870e-02, -8.29238147e-02, -8.81364495e-02,  1.62167493e-02,
        2.68793292e-02, -3.83929200e-02, -2.57957950e-02, -1.86822563e-02,
       -5.47099225e-02, -5.65230772e-02, -1.98926777e-02,  3.54687981e-02,
        1.35690883e-01,  8.04331973e-02,  1.92622133e-02,  5.81734739e-02,
       -5.02377190e-02,  2.47635460e-03, -5.33336513e-02,  4.08107415e-02,
        1.18754342e-01, -7.40583912e-02,  7.48252273e-02,  1.46314219e-01,
        6.73391623e-03, -1.98812839e-02, -2.93681423e-05, -2.12224070e-02,
        1.70804688e-03,  3.52822542e-02, -1.65668026e-01, -4.84176865e-03,
        1.21439025e-02,  8.64505395e-02, -1.57235548e-01,  7.75721148e-02,
        5.35202436e-02,  1.17224073e-02, -7.53299072e-02, -3.44986990e-02,
       -1.58868451e-02,  7.00481758e-02,  7.96044394e-02, -4.09048088e-02,
       -1.46982130e-02, -1.24979429e-01, -4.20956686e-02, -8.43289569e-02,
       -6.92764968e-02,  5.16316369e-02,  2.03369856e-02, -4.73499410e-02,
        9.15571675e-02, -5.96052743e-02,  1.10012911e-01,  2.55208667e-02,
       -8.69148783e-03, -7.76273850e-03,  4.98862900e-02,  9.31067672e-03,
       -3.49833667e-02,  1.33375779e-01,  8.40289332e-03, -3.45170535e-02,
       -3.47062238e-02, -9.73994732e-02, -2.54784450e-02, -1.39390659e-02,
       -3.32783237e-02,  9.36794057e-02,  3.47191617e-02,  2.80651636e-02,
        6.58571906e-03,  3.73428725e-02, -3.32412347e-02, -9.73492190e-02,
       -7.07265735e-02, -7.01062232e-02,  3.67225669e-02, -2.62719765e-02,
        5.82991205e-02, -7.42069781e-02,  1.66096780e-02, -8.83689746e-02,
       -1.62591994e-01,  4.79482487e-02,  5.83929494e-02, -1.04699671e-01,
        3.52650951e-03,  2.50546616e-02,  3.84298228e-02, -4.36684191e-02,
        5.68282753e-02,  6.57160487e-03, -3.02405991e-02,  2.51490474e-02],
      dtype=float32)
# Nearest neighbours of 'кот_NOUN'; the result is a list of
# (key, similarity score) pairs ordered from most to least similar.
w2v_vectors.most_similar('кот_NOUN')
[('кошка_NOUN', 0.7570087909698486),
 ('котенок_NOUN', 0.6676193475723267),
 ('пес_NOUN', 0.5633267164230347),
 ('мяукать_VERB', 0.561974287033081),
 ('тобик_NOUN', 0.5586473941802979),
 ('фоксик_NOUN', 0.5572988986968994),
 ('собака_NOUN', 0.5567899942398071),
 ('мяучать_VERB', 0.5535756349563599),
 ('харлашка_NOUN', 0.551755428314209),
 ('котяра_NOUN', 0.5508568286895752)]

GloVe

glove_model = gensim.downloader.load("glove-twitter-25")  # load the GloVe vectors
print(glove_model['cat'])  # word embedding for 'cat'
glove_model.most_similar("cat")  # show the words most similar to 'cat'
[-0.96419  -0.60978   0.67449   0.35113   0.41317  -0.21241   1.3796
  0.12854   0.31567   0.66325   0.3391   -0.18934  -3.325    -1.1491
 -0.4129    0.2195    0.8706   -0.50616  -0.12781  -0.066965  0.065761
  0.43927   0.1758   -0.56058   0.13529 ]
[('dog', 0.9590820074081421),
 ('monkey', 0.920357882976532),
 ('bear', 0.9143136739730835),
 ('pet', 0.9108031392097473),
 ('girl', 0.8880629539489746),
 ('horse', 0.8872726559638977),
 ('kitty', 0.8870542049407959),
 ('puppy', 0.886769711971283),
 ('hot', 0.886525571346283),
 ('lady', 0.8845519423484802)]
# Similarity score between the vectors for 'cat' and 'bus' (a single number;
# cosine similarity according to the gensim KeyedVectors documentation).
glove_model.similarity('cat', 'bus')
0.60927683