
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from sklearn.pipeline import Pipeline
import gensim.downloader
print(list(gensim.downloader.info()['models'].keys()))
['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']
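Before downloading, you can also inspect a model's metadata (file size, dimensionality, description); an optional sketch, not a cell from the original notebook:

# Optional: print the metadata record for one model before downloading it
print(gensim.downloader.info(name='glove-twitter-25'))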

GloVe

glove_model = gensim.downloader.load("glove-twitter-25")  # load GloVe vectors
print(glove_model['cat'])  # word embedding for 'cat'
glove_model.most_similar("cat")  # words most similar to 'cat'
[-0.96419  -0.60978   0.67449   0.35113   0.41317  -0.21241   1.3796
  0.12854   0.31567   0.66325   0.3391   -0.18934  -3.325    -1.1491
 -0.4129    0.2195    0.8706   -0.50616  -0.12781  -0.066965  0.065761
  0.43927   0.1758   -0.56058   0.13529 ]
[('dog', 0.9590820074081421),
 ('monkey', 0.920357882976532),
 ('bear', 0.9143136739730835),
 ('pet', 0.9108031392097473),
 ('girl', 0.8880629539489746),
 ('horse', 0.8872726559638977),
 ('kitty', 0.8870542049407959),
 ('puppy', 0.886769711971283),
 ('hot', 0.886525571346283),
 ('lady', 0.8845519423484802)]
glove_model.similarity('cat', 'bus')
0.60927683
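The same model supports simple vector arithmetic for word analogies; a minimal sketch (the specific words are just an illustration, not from the original notebook):

# Analogy sketch: vector('king') - vector('man') + vector('woman')
# should land near 'queen' in a well-trained model
glove_model.most_similar(positive=['king', 'woman'], negative=['man'], topn=3)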
categories = ['alt.atheism', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, categories=categories, remove=remove)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories=categories, remove=remove)
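A quick sanity check of what was loaded (the counts match the shapes printed further below):

# Class names and number of documents in each split
print(twenty_train['target_names'])  # ['alt.atheism', 'comp.graphics', 'sci.space']
print(len(twenty_train['data']), len(twenty_test['data']))  # 1657 train / 1102 test documents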

Vectorize the training set

Build the document-term matrix

vectorizer = CountVectorizer(stop_words='english')
train_data = vectorizer.fit_transform(twenty_train['data'])
CV_data=pd.DataFrame(train_data.toarray(), columns=vectorizer.get_feature_names_out())
print(CV_data.shape)
CV_data.head()
(1657, 23297)
00 000 0000 00000 000000 000005102000 000062david42 000100255pixel 00041032 0004136 ... zurbrin zurich zus zvi zwaartepunten zwak zwakke zware zwarte zyxel
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 23297 columns

# Build the list of words present in the vocabulary.
words_vocab=CV_data.columns
print(words_vocab[0:10])
Index(['00', '000', '0000', '00000', '000000', '000005102000', '000062david42',
       '000100255pixel', '00041032', '0004136'],
      dtype='object')
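The head of the vocabulary shows that purely numeric tokens such as '00' and '000005102000' survive the default tokenizer. They could be filtered out with a stricter token_pattern; an optional variation, not used in this notebook:

# Optional: keep only tokens made of two or more letters
strict_vectorizer = CountVectorizer(stop_words='english',
                                    token_pattern=r'(?u)\b[a-zA-Z][a-zA-Z]+\b')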

Vectorize with GloVe

For each document we sum the GloVe vectors of the words it consists of. The result is a document vector equal to the sum of its word vectors.

Let's see on a small example how this vectorization works.

text_data = ['Hello world I love python', 'This is a great computer game! 00 000 zyxel']
# Vectorize with the fitted CountVectorizer
X = vectorizer.transform(text_data)
CV_text_data=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
CV_text_data
00 000 0000 00000 000000 000005102000 000062david42 000100255pixel 00041032 0004136 ... zurbrin zurich zus zvi zwaartepunten zwak zwakke zware zwarte zyxel
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 1 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1

2 rows × 23297 columns

# Create a DataFrame that will hold one vector per document
glove_data = pd.DataFrame()

# Iterate over the rows (one row per document)
for i in range(CV_text_data.shape[0]):

    # Document vector with the dimensionality of the GloVe model:
    one_doc = np.zeros(25)

    # For each document, find which of its words are present in our vocabulary
    # and sum the GloVe vectors of every known word into one_doc
    for word in words_vocab[CV_text_data.iloc[i, :] >= 1]:
        if word in glove_model.key_to_index:
            print(word, ': ', glove_model[word])
            one_doc += glove_model[word]
    print(text_data[i], ': ', one_doc)
    glove_data = pd.concat([glove_data, pd.DataFrame([one_doc])], axis=0)
print('glove_data: ', glove_data)
hello :  [-0.77069    0.12827    0.33137    0.0050893 -0.47605   -0.50116
  1.858      1.0624    -0.56511    0.13328   -0.41918   -0.14195
 -2.8555    -0.57131   -0.13418   -0.44922    0.48591   -0.6479
 -0.84238    0.61669   -0.19824   -0.57967   -0.65885    0.43928
 -0.50473  ]
love :  [-0.62645  -0.082389  0.070538  0.5782   -0.87199  -0.14816   2.2315
  0.98573  -1.3154   -0.34921  -0.8847    0.14585  -4.97     -0.73369
 -0.94359   0.035859 -0.026733 -0.77538  -0.30014   0.48853  -0.16678
 -0.016651 -0.53164   0.64236  -0.10922 ]
python :  [-0.25645  -0.22323   0.025901  0.22901   0.49028  -0.060829  0.24563
 -0.84854   1.5882   -0.7274    0.60603   0.25205  -1.8064   -0.95526
  0.44867   0.013614  0.60856   0.65423   0.82506   0.99459  -0.29403
 -0.27013  -0.348    -0.7293    0.2201  ]
world :  [ 0.10301   0.095666 -0.14789  -0.22383  -0.14775  -0.11599   1.8513
  0.24886  -0.41877  -0.20384  -0.08509   0.33246  -4.6946    0.84096
 -0.46666  -0.031128 -0.19539  -0.037349  0.58949   0.13941  -0.57667
 -0.44426  -0.43085  -0.52875   0.25855 ]
Hello world I love python :  [ -1.55058002  -0.081683     0.27991899   0.58846928  -1.00551002
  -0.82613902   6.18642995   1.44844997  -0.71108004  -1.14717001
  -0.78294002   0.58841    -14.32649982  -1.41929996  -1.09575997
  -0.430875     0.87234702  -0.806399     0.27203003   2.23921998
  -1.23571999  -1.31071102  -1.96934     -0.17641005  -0.1353    ]
computer :  [ 0.64005  -0.019514  0.70148  -0.66123   1.1723   -0.58859   0.25917
 -0.81541   1.1708    1.1413   -0.15405  -0.11369  -3.8414   -0.87233
  0.47489   1.1541    0.97678   1.1107   -0.14572  -0.52013  -0.52234
 -0.92349   0.34651   0.061939 -0.57375 ]
game :  [ 1.146     0.3291    0.26878  -1.3945   -0.30044   0.77901   1.3537
  0.37393   0.50478  -0.44266  -0.048706  0.51396  -4.3136    0.39805
  1.197     0.10287  -0.17618  -1.2881   -0.59801   0.26131  -1.2619
  0.39202   0.59309  -0.55232   0.005087]
great :  [-8.4229e-01  3.6512e-01 -3.8841e-01 -4.6118e-01  2.4301e-01  3.2412e-01
  1.9009e+00 -2.2630e-01 -3.1335e-01 -1.0970e+00 -4.1494e-03  6.2074e-01
 -5.0964e+00  6.7418e-01  5.0080e-01 -6.2119e-01  5.1765e-01 -4.4122e-01
 -1.4364e-01  1.9130e-01 -7.4608e-01 -2.5903e-01 -7.8010e-01  1.1030e-01
 -2.7928e-01]
zyxel :  [ 0.79234   0.067376 -0.22639  -2.2272    0.30057  -0.85676  -1.7268
 -0.78626   1.2042   -0.92348  -0.83987  -0.74233   0.29689  -1.208
  0.98706  -1.1624    0.61415  -0.27825   0.27813   1.5838   -0.63593
 -0.10225   1.7102   -0.95599  -1.3867  ]
This is a great computer game! 00 000 zyxel :  [  1.73610002   0.74208201   0.35545996  -4.74411008   1.41543998
  -0.34222007   1.78697008  -1.45404002   2.56643     -1.32184002
  -1.04677537   0.27867999 -12.95450976  -1.00809997   3.15975004
  -0.52662008   1.93239999  -0.89686999  -0.60924001   1.51628
  -3.16624993  -0.89275002   1.86969995  -1.33607102  -2.23464306]
glove_data:          0         1         2         3        4         5        6        7   \
0 -1.55058 -0.081683  0.279919  0.588469 -1.00551 -0.826139  6.18643  1.44845   
0  1.73610  0.742082  0.355460 -4.744110  1.41544 -0.342220  1.78697 -1.45404   

        8        9   ...        15        16        17       18       19  \
0 -0.71108 -1.14717  ... -0.430875  0.872347 -0.806399  0.27203  2.23922   
0  2.56643 -1.32184  ... -0.526620  1.932400 -0.896870 -0.60924  1.51628   

        20        21       22        23        24  
0 -1.23572 -1.310711 -1.96934 -0.176410 -0.135300  
0 -3.16625 -0.892750  1.86970 -1.336071 -2.234643  

[2 rows x 25 columns]
def text2vec(text_data):

    # Vectorize with the fitted CountVectorizer
    X = vectorizer.transform(text_data)
    CV_text_data = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

    # DataFrame that will hold one vector per document
    glove_data = pd.DataFrame()

    # Iterate over the rows (one row per document)
    for i in range(CV_text_data.shape[0]):

        # Document vector with the dimensionality of the GloVe model:
        one_doc = np.zeros(25)

        # Find which of the document's words are present in our vocabulary
        # and sum the GloVe vectors of every known word into one_doc
        for word in words_vocab[CV_text_data.iloc[i, :] >= 1]:
            if word in glove_model.key_to_index:
                one_doc += glove_model[word]
        glove_data = pd.concat([glove_data, pd.DataFrame([one_doc])], axis=0)

    return glove_data

glove_data
0 1 2 3 4 5 6 7 8 9 ... 15 16 17 18 19 20 21 22 23 24
0 -1.55058 -0.081683 0.279919 0.588469 -1.00551 -0.826139 6.18643 1.44845 -0.71108 -1.14717 ... -0.430875 0.872347 -0.806399 0.27203 2.23922 -1.23572 -1.310711 -1.96934 -0.176410 -0.135300
0 1.73610 0.742082 0.355460 -4.744110 1.41544 -0.342220 1.78697 -1.45404 2.56643 -1.32184 ... -0.526620 1.932400 -0.896870 -0.60924 1.51628 -3.16625 -0.892750 1.86970 -1.336071 -2.234643

2 rows × 25 columns

one_doc
array([  1.73610002,   0.74208201,   0.35545996,  -4.74411008,
         1.41543998,  -0.34222007,   1.78697008,  -1.45404002,
         2.56643   ,  -1.32184002,  -1.04677537,   0.27867999,
       -12.95450976,  -1.00809997,   3.15975004,  -0.52662008,
         1.93239999,  -0.89686999,  -0.60924001,   1.51628   ,
        -3.16624993,  -0.89275002,   1.86969995,  -1.33607102,
        -2.23464306])
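One design note before vectorizing the full corpus: summing gives longer documents vectors with larger norms, so a common variation is mean pooling. A minimal sketch with a hypothetical helper, not what this notebook uses:

# Hypothetical mean-pooling variant: average rather than sum the word vectors
def doc_mean_vector(tokens, model, dim=25):
    known = [model[t] for t in tokens if t in model.key_to_index]
    return np.mean(known, axis=0) if known else np.zeros(dim)

# e.g. doc_mean_vector(['hello', 'world'], glove_model)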
train_data_glove = text2vec(twenty_train['data'])
train_data_glove
0 1 2 3 4 5 6 7 8 9 ... 15 16 17 18 19 20 21 22 23 24
0 -8.521142 2.020376 -10.802921 3.167636 0.252469 15.544048 17.631184 -32.581192 9.696540 -11.103087 ... 2.810453 7.900215 0.962129 17.691130 -1.252574 -10.098049 0.500113 1.348694 2.186150 -16.556824
0 6.576228 20.336350 -32.675150 -9.073872 17.515655 -6.488794 59.458419 -75.384298 13.323775 -14.443218 ... 21.407738 23.525118 0.325680 19.871444 -27.585188 -4.559155 -7.417482 -16.694553 -0.197711 -58.948193
0 1.329914 3.060870 -1.868484 1.392735 -1.335277 -5.014955 12.859476 -9.978156 -0.869613 -2.031490 ... 2.925134 2.872930 2.184486 3.831770 -0.877866 -0.927770 0.700101 -9.855365 -5.419429 -2.279330
0 -4.866150 -0.273176 3.515124 -5.008165 -1.236789 -7.951168 -11.015882 -3.496241 16.024286 -9.388742 ... -0.471141 3.575378 6.193222 0.349430 15.040248 -10.369132 -0.848717 -0.564796 -1.114126 -7.844431
0 -3.115007 -1.805252 -5.419340 -0.393406 -0.406461 -2.724340 7.898330 -15.619113 0.231822 -3.628156 ... 5.944151 8.309932 -0.656084 12.178709 -6.118551 -3.286376 3.450946 2.055343 0.463787 -12.644626
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
0 -0.930954 4.974043 -8.147008 -5.147130 3.960455 -1.344022 7.818063 -25.427420 4.624732 -7.218097 ... 3.623038 4.453189 2.405320 8.032963 -8.029539 0.838867 -4.757457 -5.755052 -9.496197 -21.542710
0 -0.770690 0.128270 0.331370 0.005089 -0.476050 -0.501160 1.858000 1.062400 -0.565110 0.133280 ... -0.449220 0.485910 -0.647900 -0.842380 0.616690 -0.198240 -0.579670 -0.658850 0.439280 -0.504730
0 1.491177 6.992638 -7.921970 -7.157521 6.641657 -2.958020 12.820770 -18.502946 6.838083 -2.717310 ... -1.344873 4.170405 -0.178030 5.699992 -7.295038 -3.683306 -2.718006 -0.117608 -7.205832 -13.863438
0 2.523770 5.817394 2.184340 -2.996497 -0.267181 -10.059634 6.344402 -2.047127 2.679123 -7.642505 ... -1.230296 1.409746 -3.322040 -5.068259 -0.648718 0.753010 -6.220990 -5.012004 -1.518542 -10.156440
0 -0.118691 11.860546 -2.567264 -10.955913 -4.239322 -9.340552 21.189778 -10.895375 2.659030 -3.848115 ... 0.726191 11.634998 -5.447248 1.293007 -7.882002 -2.527453 0.298939 -6.107062 3.365051 -15.641826

1657 rows × 25 columns

train_data
<1657x23297 sparse matrix of type '<class 'numpy.int64'>'
	with 106580 stored elements in Compressed Sparse Row format>
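The dense GloVe features are dramatically lower-dimensional than the sparse count matrix, which keeps KNN distance computations cheap; a quick check (shapes as reported above):

# 25 dense GloVe dimensions versus 23,297 sparse count dimensions
print(train_data.shape, train_data_glove.shape)  # (1657, 23297) (1657, 25)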
clf = KNeighborsClassifier(n_neighbors = 5)
clf.fit(train_data_glove, twenty_train['target'])
KNeighborsClassifier()
test_data_glove = text2vec(twenty_test['data'])
test_data_glove
0 1 2 3 4 5 6 7 8 9 ... 15 16 17 18 19 20 21 22 23 24
0 -6.760635 5.063863 -2.779060 3.699120 -2.858086 0.135230 20.811229 -19.425567 7.302950 -5.826012 ... 3.833378 6.794452 -0.921720 12.187404 -5.547615 -4.133999 3.588260 -0.497106 -2.542142 -11.362855
0 1.632616 2.512300 -0.745513 -3.081154 2.182067 -1.988816 7.533100 -1.015740 -0.829598 -2.764237 ... 0.791851 2.114150 -2.249193 -0.163590 -1.177710 -2.496928 -5.074085 -2.666947 0.662050 -3.590550
0 2.115766 2.142060 -0.445607 -3.229030 1.154580 -2.877278 6.399954 -10.445769 2.230760 -3.299899 ... 4.388870 8.515056 -0.766260 3.549431 -1.643443 -0.825730 -2.968016 -0.808924 -0.000160 -7.468189
0 -0.802784 5.199443 4.294071 -7.390966 2.747166 -1.359952 15.032628 -1.601590 1.474406 2.570105 ... 3.043432 6.176236 -6.193988 -3.990476 -2.345854 -5.534376 -8.925422 1.553300 0.905790 -12.824533
0 29.926489 65.324993 -25.059592 -64.080130 77.565282 -34.614604 75.643770 -115.600859 90.847175 -42.971146 ... 40.956031 50.322156 -19.537098 28.903925 -34.643949 -69.894146 -94.992145 -48.601895 -29.098555 -91.934770
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
0 1.829235 4.513807 2.916520 2.237308 -1.704831 -1.811192 22.196895 -12.858912 -4.054810 -3.130457 ... 6.019246 8.949456 -4.682214 -5.648911 -1.026898 3.719006 2.449941 -6.487197 1.340930 -7.325196
0 -0.963815 5.491164 3.567377 -6.048021 -5.059298 -0.977958 15.131499 -0.904470 2.185990 -1.459807 ... 0.968499 4.725793 -0.726944 1.328612 -3.144209 1.643127 -1.259245 -0.880740 -6.713165 -3.115454
0 6.801324 15.348126 -17.051718 5.030998 9.332448 -5.716691 56.409175 -56.250411 -4.028209 -11.687558 ... 22.884424 12.940570 1.058664 21.879058 -20.897253 2.537755 3.774890 -11.495336 -2.609774 -36.597559
0 1.054090 0.764524 1.958340 -1.085245 -0.441392 -0.421970 6.139770 -0.612219 -2.251460 -0.465165 ... 0.377958 1.957450 -1.705220 -0.509700 0.016110 1.461620 1.589069 2.267340 0.447919 -0.469250
0 -18.387286 13.274879 -7.895913 -1.831442 -10.424961 -12.248442 32.153890 -40.169293 13.089525 -21.306493 ... 6.497279 8.340729 4.996109 23.442078 -3.701088 -11.671505 9.209790 -10.002501 -0.815266 -17.024052

1102 rows × 25 columns

predict = clf.predict(test_data_glove)
print(confusion_matrix(twenty_test['target'], predict))
print(classification_report(twenty_test['target'], predict))
[[225  35  59]
 [ 26 313  50]
 [ 56  98 240]]
              precision    recall  f1-score   support

           0       0.73      0.71      0.72       319
           1       0.70      0.80      0.75       389
           2       0.69      0.61      0.65       394

    accuracy                           0.71      1102
   macro avg       0.71      0.71      0.70      1102
weighted avg       0.71      0.71      0.70      1102
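For comparison, the unused Pipeline and TfidfTransformer imports at the top suggest a classical bag-of-words baseline; a minimal sketch of such a comparison, my assumption rather than a cell from the original notebook:

# Hypothetical TF-IDF + KNN baseline to compare against the 25-d GloVe features
tfidf_knn = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier(n_neighbors=5)),
])
tfidf_knn.fit(twenty_train['data'], twenty_train['target'])
print(accuracy_score(twenty_test['target'], tfidf_knn.predict(twenty_test['data'])))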