from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from sklearn.pipeline import Pipeline
import gensim.downloader
print(list(gensim.downloader.info()['models'].keys()))
['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']
GloVe
glove_model = gensim.downloader.load("glove-twitter-25")  # load glove vectors
print(glove_model['cat'])  # word embedding for 'cat'
glove_model.most_similar("cat")  # show words that are similar to the word 'cat'
[-0.96419 -0.60978 0.67449 0.35113 0.41317 -0.21241 1.3796
0.12854 0.31567 0.66325 0.3391 -0.18934 -3.325 -1.1491
-0.4129 0.2195 0.8706 -0.50616 -0.12781 -0.066965 0.065761
0.43927 0.1758 -0.56058 0.13529 ]
[('dog', 0.9590820074081421),
('monkey', 0.920357882976532),
('bear', 0.9143136739730835),
('pet', 0.9108031392097473),
('girl', 0.8880629539489746),
('horse', 0.8872726559638977),
('kitty', 0.8870542049407959),
('puppy', 0.886769711971283),
('hot', 0.886525571346283),
('lady', 0.8845519423484802)]
glove_model.similarity('cat', 'bus')
0.60927683
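gensim's similarity() here is the cosine similarity of the two 25-dimensional word vectors; a small sketch with numpy (using the already loaded glove_model) reproduces the same value:

# cosine similarity computed by hand; should match glove_model.similarity('cat', 'bus')
v_cat, v_bus = glove_model['cat'], glove_model['bus']
print(np.dot(v_cat, v_bus) / (np.linalg.norm(v_cat) * np.linalg.norm(v_bus)))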
categories = ['alt.atheism', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, categories=categories, remove=remove)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories=categories, remove=remove)
Vectorize the training set
Build the document-term matrix
vectorizer = CountVectorizer(stop_words='english')
train_data = vectorizer.fit_transform(twenty_train['data'])
CV_data = pd.DataFrame(train_data.toarray(), columns=vectorizer.get_feature_names_out())
print(CV_data.shape)
CV_data.head()
(1657, 23297)
00 | 000 | 0000 | 00000 | 000000 | 000005102000 | 000062david42 | 000100255pixel | 00041032 | 0004136 | ... | zurbrin | zurich | zus | zvi | zwaartepunten | zwak | zwakke | zware | zwarte | zyxel | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 23297 columns
# Build the list of words present in the vocabulary.
words_vocab = CV_data.columns
print(words_vocab[0:10])
Index(['00', '000', '0000', '00000', '000000', '000005102000', '000062david42',
'000100255pixel', '00041032', '0004136'],
dtype='object')
Vectorize with GloVe
For each document we sum the GloVe vectors of the words it consists of. The resulting document vector is the sum of the vectors of its words.
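As a minimal sketch of the idea (assuming both words are in the glove-twitter-25 vocabulary, which the lookups below confirm), the vector of a toy two-word document is just the element-wise sum of its word vectors:

# toy illustration: the "document" consisting of the words hello and world
doc_vec = glove_model['hello'] + glove_model['world']
print(doc_vec.shape)  # (25,) -- one 25-dimensional vector per document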
Let's look at an example of how this vectorization works.
text_data = ['Hello world I love python', 'This is a great computer game! 00 000 zyxel']
# Vectorize with the trained CountVectorizer
X = vectorizer.transform(text_data)
CV_text_data = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
CV_text_data
00 | 000 | 0000 | 00000 | 000000 | 000005102000 | 000062david42 | 000100255pixel | 00041032 | 0004136 | ... | zurbrin | zurich | zus | zvi | zwaartepunten | zwak | zwakke | zware | zwarte | zyxel | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 rows × 23297 columns
# Create a dataframe that will hold the document vectors
glove_data = pd.DataFrame()

# Iterate over each row of the dataframe (each document)
for i in range(CV_text_data.shape[0]):
    # Vector of one document, with the dimensionality of the GloVe model:
    one_doc = np.zeros(25)

    # For each document, check which of its words are present in our vocabulary
    # and sum the GloVe vectors of every known word into one_doc
    for word in words_vocab[CV_text_data.iloc[i, :] >= 1]:
        if word in glove_model.key_to_index.keys():
            print(word, ': ', glove_model[word])
            one_doc += glove_model[word]
    print(text_data[i], ': ', one_doc)
    glove_data = glove_data.append(pd.DataFrame([one_doc]))
hello : [-0.77069 0.12827 0.33137 0.0050893 -0.47605 -0.50116
1.858 1.0624 -0.56511 0.13328 -0.41918 -0.14195
-2.8555 -0.57131 -0.13418 -0.44922 0.48591 -0.6479
-0.84238 0.61669 -0.19824 -0.57967 -0.65885 0.43928
-0.50473 ]
love : [-0.62645 -0.082389 0.070538 0.5782 -0.87199 -0.14816 2.2315
0.98573 -1.3154 -0.34921 -0.8847 0.14585 -4.97 -0.73369
-0.94359 0.035859 -0.026733 -0.77538 -0.30014 0.48853 -0.16678
-0.016651 -0.53164 0.64236 -0.10922 ]
python : [-0.25645 -0.22323 0.025901 0.22901 0.49028 -0.060829 0.24563
-0.84854 1.5882 -0.7274 0.60603 0.25205 -1.8064 -0.95526
0.44867 0.013614 0.60856 0.65423 0.82506 0.99459 -0.29403
-0.27013 -0.348 -0.7293 0.2201 ]
world : [ 0.10301 0.095666 -0.14789 -0.22383 -0.14775 -0.11599 1.8513
0.24886 -0.41877 -0.20384 -0.08509 0.33246 -4.6946 0.84096
-0.46666 -0.031128 -0.19539 -0.037349 0.58949 0.13941 -0.57667
-0.44426 -0.43085 -0.52875 0.25855 ]
Hello world I love python : [ -1.55058002 -0.081683 0.27991899 0.58846928 -1.00551002
-0.82613902 6.18642995 1.44844997 -0.71108004 -1.14717001
-0.78294002 0.58841 -14.32649982 -1.41929996 -1.09575997
-0.430875 0.87234702 -0.806399 0.27203003 2.23921998
-1.23571999 -1.31071102 -1.96934 -0.17641005 -0.1353 ]
computer : [ 0.64005 -0.019514 0.70148 -0.66123 1.1723 -0.58859 0.25917
-0.81541 1.1708 1.1413 -0.15405 -0.11369 -3.8414 -0.87233
0.47489 1.1541 0.97678 1.1107 -0.14572 -0.52013 -0.52234
-0.92349 0.34651 0.061939 -0.57375 ]
game : [ 1.146 0.3291 0.26878 -1.3945 -0.30044 0.77901 1.3537
0.37393 0.50478 -0.44266 -0.048706 0.51396 -4.3136 0.39805
1.197 0.10287 -0.17618 -1.2881 -0.59801 0.26131 -1.2619
0.39202 0.59309 -0.55232 0.005087]
great : [-8.4229e-01 3.6512e-01 -3.8841e-01 -4.6118e-01 2.4301e-01 3.2412e-01
1.9009e+00 -2.2630e-01 -3.1335e-01 -1.0970e+00 -4.1494e-03 6.2074e-01
-5.0964e+00 6.7418e-01 5.0080e-01 -6.2119e-01 5.1765e-01 -4.4122e-01
-1.4364e-01 1.9130e-01 -7.4608e-01 -2.5903e-01 -7.8010e-01 1.1030e-01
-2.7928e-01]
zyxel : [ 0.79234 0.067376 -0.22639 -2.2272 0.30057 -0.85676 -1.7268
-0.78626 1.2042 -0.92348 -0.83987 -0.74233 0.29689 -1.208
0.98706 -1.1624 0.61415 -0.27825 0.27813 1.5838 -0.63593
-0.10225 1.7102 -0.95599 -1.3867 ]
This is a great computer game! 00 000 zyxel : [ 1.73610002 0.74208201 0.35545996 -4.74411008 1.41543998
-0.34222007 1.78697008 -1.45404002 2.56643 -1.32184002
-1.04677537 0.27867999 -12.95450976 -1.00809997 3.15975004
-0.52662008 1.93239999 -0.89686999 -0.60924001 1.51628
-3.16624993 -0.89275002 1.86969995 -1.33607102 -2.23464306]
C:\Users\Андрей\AppData\Local\Temp\ipykernel_8524\2010506005.py:17: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
glove_data=glove_data.append(pd.DataFrame([one_doc]))
def text2vec(text_data):
    # Vectorize with the trained CountVectorizer
    X = vectorizer.transform(text_data)
    CV_text_data = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

    # Create a dataframe that will hold the document vectors
    glove_data = pd.DataFrame()

    # Iterate over each row (each document)
    for i in range(CV_text_data.shape[0]):
        # Vector of one document, with the dimensionality of the GloVe model:
        one_doc = np.zeros(25)

        # For each document, check which of its words are present in our vocabulary
        # and sum the GloVe vectors of every known word into one_doc
        for word in words_vocab[CV_text_data.iloc[i, :] >= 1]:
            if word in glove_model.key_to_index.keys():
                #print(word, ': ', glove_model[word])
                one_doc += glove_model[word]
        #print(text_data[i], ': ', one_doc)
        glove_data = pd.concat([glove_data, pd.DataFrame([one_doc])], axis=0)
        #print('glove_data: ', glove_data)
    return glove_data
glove_data
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -1.55058 | -0.081683 | 0.279919 | 0.588469 | -1.00551 | -0.826139 | 6.18643 | 1.44845 | -0.71108 | -1.14717 | ... | -0.430875 | 0.872347 | -0.806399 | 0.27203 | 2.23922 | -1.23572 | -1.310711 | -1.96934 | -0.176410 | -0.135300 |
0 | 1.73610 | 0.742082 | 0.355460 | -4.744110 | 1.41544 | -0.342220 | 1.78697 | -1.45404 | 2.56643 | -1.32184 | ... | -0.526620 | 1.932400 | -0.896870 | -0.60924 | 1.51628 | -3.16625 | -0.892750 | 1.86970 | -1.336071 | -2.234643 |
2 rows × 25 columns
train_data_glove = text2vec(twenty_train['data']);
train_data_glove
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -8.521142 | 2.020376 | -10.802921 | 3.167636 | 0.252469 | 15.544048 | 17.631184 | -32.581192 | 9.696540 | -11.103087 | ... | 2.810453 | 7.900215 | 0.962129 | 17.691130 | -1.252574 | -10.098049 | 0.500113 | 1.348694 | 2.186150 | -16.556824 |
0 | 6.576228 | 20.336350 | -32.675150 | -9.073872 | 17.515655 | -6.488794 | 59.458419 | -75.384298 | 13.323775 | -14.443218 | ... | 21.407738 | 23.525118 | 0.325680 | 19.871444 | -27.585188 | -4.559155 | -7.417482 | -16.694553 | -0.197711 | -58.948193 |
0 | 1.329914 | 3.060870 | -1.868484 | 1.392735 | -1.335277 | -5.014955 | 12.859476 | -9.978156 | -0.869613 | -2.031490 | ... | 2.925134 | 2.872930 | 2.184486 | 3.831770 | -0.877866 | -0.927770 | 0.700101 | -9.855365 | -5.419429 | -2.279330 |
0 | -4.866150 | -0.273176 | 3.515124 | -5.008165 | -1.236789 | -7.951168 | -11.015882 | -3.496241 | 16.024286 | -9.388742 | ... | -0.471141 | 3.575378 | 6.193222 | 0.349430 | 15.040248 | -10.369132 | -0.848717 | -0.564796 | -1.114126 | -7.844431 |
0 | -3.115007 | -1.805252 | -5.419340 | -0.393406 | -0.406461 | -2.724340 | 7.898330 | -15.619113 | 0.231822 | -3.628156 | ... | 5.944151 | 8.309932 | -0.656084 | 12.178709 | -6.118551 | -3.286376 | 3.450946 | 2.055343 | 0.463787 | -12.644626 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
0 | -0.930954 | 4.974043 | -8.147008 | -5.147130 | 3.960455 | -1.344022 | 7.818063 | -25.427420 | 4.624732 | -7.218097 | ... | 3.623038 | 4.453189 | 2.405320 | 8.032963 | -8.029539 | 0.838867 | -4.757457 | -5.755052 | -9.496197 | -21.542710 |
0 | -0.770690 | 0.128270 | 0.331370 | 0.005089 | -0.476050 | -0.501160 | 1.858000 | 1.062400 | -0.565110 | 0.133280 | ... | -0.449220 | 0.485910 | -0.647900 | -0.842380 | 0.616690 | -0.198240 | -0.579670 | -0.658850 | 0.439280 | -0.504730 |
0 | 1.491177 | 6.992638 | -7.921970 | -7.157521 | 6.641657 | -2.958020 | 12.820770 | -18.502946 | 6.838083 | -2.717310 | ... | -1.344873 | 4.170405 | -0.178030 | 5.699992 | -7.295038 | -3.683306 | -2.718006 | -0.117608 | -7.205832 | -13.863438 |
0 | 2.523770 | 5.817394 | 2.184340 | -2.996497 | -0.267181 | -10.059634 | 6.344402 | -2.047127 | 2.679123 | -7.642505 | ... | -1.230296 | 1.409746 | -3.322040 | -5.068259 | -0.648718 | 0.753010 | -6.220990 | -5.012004 | -1.518542 | -10.156440 |
0 | -0.118691 | 11.860546 | -2.567264 | -10.955913 | -4.239322 | -9.340552 | 21.189778 | -10.895375 | 2.659030 | -3.848115 | ... | 0.726191 | 11.634998 | -5.447248 | 1.293007 | -7.882002 | -2.527453 | 0.298939 | -6.107062 | 3.365051 | -15.641826 |
1657 rows × 25 columns
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(train_data_glove, twenty_train['target'])
KNeighborsClassifier()
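As a quick sanity check (a sketch that is not part of the original run), the fitted classifier's score method could report training accuracy before moving to the test set:

# not in the original notebook: accuracy of the 5-NN classifier on its own training vectors
print(clf.score(train_data_glove, twenty_train['target']))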
test_data_glove = text2vec(twenty_test['data']);
test_data_glove
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | -6.760635 | 5.063863 | -2.779060 | 3.699120 | -2.858086 | 0.135230 | 20.811229 | -19.425567 | 7.302950 | -5.826012 | ... | 3.833378 | 6.794452 | -0.921720 | 12.187404 | -5.547615 | -4.133999 | 3.588260 | -0.497106 | -2.542142 | -11.362855 |
0 | 1.632616 | 2.512300 | -0.745513 | -3.081154 | 2.182067 | -1.988816 | 7.533100 | -1.015740 | -0.829598 | -2.764237 | ... | 0.791851 | 2.114150 | -2.249193 | -0.163590 | -1.177710 | -2.496928 | -5.074085 | -2.666947 | 0.662050 | -3.590550 |
0 | 2.115766 | 2.142060 | -0.445607 | -3.229030 | 1.154580 | -2.877278 | 6.399954 | -10.445769 | 2.230760 | -3.299899 | ... | 4.388870 | 8.515056 | -0.766260 | 3.549431 | -1.643443 | -0.825730 | -2.968016 | -0.808924 | -0.000160 | -7.468189 |
0 | -0.802784 | 5.199443 | 4.294071 | -7.390966 | 2.747166 | -1.359952 | 15.032628 | -1.601590 | 1.474406 | 2.570105 | ... | 3.043432 | 6.176236 | -6.193988 | -3.990476 | -2.345854 | -5.534376 | -8.925422 | 1.553300 | 0.905790 | -12.824533 |
0 | 29.926489 | 65.324993 | -25.059592 | -64.080130 | 77.565282 | -34.614604 | 75.643770 | -115.600859 | 90.847175 | -42.971146 | ... | 40.956031 | 50.322156 | -19.537098 | 28.903925 | -34.643949 | -69.894146 | -94.992145 | -48.601895 | -29.098555 | -91.934770 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
0 | 1.829235 | 4.513807 | 2.916520 | 2.237308 | -1.704831 | -1.811192 | 22.196895 | -12.858912 | -4.054810 | -3.130457 | ... | 6.019246 | 8.949456 | -4.682214 | -5.648911 | -1.026898 | 3.719006 | 2.449941 | -6.487197 | 1.340930 | -7.325196 |
0 | -0.963815 | 5.491164 | 3.567377 | -6.048021 | -5.059298 | -0.977958 | 15.131499 | -0.904470 | 2.185990 | -1.459807 | ... | 0.968499 | 4.725793 | -0.726944 | 1.328612 | -3.144209 | 1.643127 | -1.259245 | -0.880740 | -6.713165 | -3.115454 |
0 | 6.801324 | 15.348126 | -17.051718 | 5.030998 | 9.332448 | -5.716691 | 56.409175 | -56.250411 | -4.028209 | -11.687558 | ... | 22.884424 | 12.940570 | 1.058664 | 21.879058 | -20.897253 | 2.537755 | 3.774890 | -11.495336 | -2.609774 | -36.597559 |
0 | 1.054090 | 0.764524 | 1.958340 | -1.085245 | -0.441392 | -0.421970 | 6.139770 | -0.612219 | -2.251460 | -0.465165 | ... | 0.377958 | 1.957450 | -1.705220 | -0.509700 | 0.016110 | 1.461620 | 1.589069 | 2.267340 | 0.447919 | -0.469250 |
0 | -18.387286 | 13.274879 | -7.895913 | -1.831442 | -10.424961 | -12.248442 | 32.153890 | -40.169293 | 13.089525 | -21.306493 | ... | 6.497279 | 8.340729 | 4.996109 | 23.442078 | -3.701088 | -11.671505 | 9.209790 | -10.002501 | -0.815266 | -17.024052 |
1102 rows × 25 columns
predict = clf.predict(test_data_glove)
print(confusion_matrix(twenty_test['target'], predict))
print(classification_report(twenty_test['target'], predict))
[[225 35 59]
[ 26 313 50]
[ 56 98 240]]
precision recall f1-score support
0 0.73 0.71 0.72 319
1 0.70 0.80 0.75 389
2 0.69 0.61 0.65 394
accuracy 0.71 1102
macro avg 0.71 0.71 0.70 1102
weighted avg 0.71 0.71 0.70 1102
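The imports at the top also bring in TfidfTransformer and Pipeline, which are never used above. For comparison, a minimal sketch (an assumption on my part, not part of the original run) of a bag-of-words baseline on the same split could look like this:

# hypothetical baseline for comparison: counts -> TF-IDF -> 5-NN on the same newsgroups split
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier(n_neighbors=5)),
])
text_clf.fit(twenty_train['data'], twenty_train['target'])
baseline_pred = text_clf.predict(twenty_test['data'])
print(accuracy_score(twenty_test['target'], baseline_pred))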