from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from sklearn.pipeline import Pipeline
import gensim.downloader
print(list(gensim.downloader.info()['models'].keys()))
['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']
GloVe
glove_model = gensim.downloader.load("glove-twitter-25")  # load GloVe vectors
print(glove_model['cat'])  # word embedding for 'cat'
glove_model.most_similar("cat")  # show the words most similar to 'cat'
[-0.96419  -0.60978   0.67449   0.35113   0.41317  -0.21241   1.3796
  0.12854   0.31567   0.66325   0.3391   -0.18934  -3.325    -1.1491
 -0.4129    0.2195    0.8706   -0.50616  -0.12781  -0.066965  0.065761
  0.43927   0.1758   -0.56058   0.13529 ]
[('dog', 0.9590820074081421),
('monkey', 0.920357882976532),
('bear', 0.9143136739730835),
('pet', 0.9108031392097473),
('girl', 0.8880629539489746),
('horse', 0.8872726559638977),
('kitty', 0.8870542049407959),
('puppy', 0.886769711971283),
('hot', 0.886525571346283),
('lady', 0.8845519423484802)]
glove_model.similarity('cat', 'bus')
0.60927683
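Under the hood, similarity is just the cosine between the two word vectors. A minimal sanity check with numpy (a sketch using the imports above):

# Sketch: recompute the similarity as the cosine of the two raw vectors
v_cat, v_bus = glove_model['cat'], glove_model['bus']
print(np.dot(v_cat, v_bus) / (np.linalg.norm(v_cat) * np.linalg.norm(v_bus)))  # should match glove_model.similarity('cat', 'bus')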
categories = ['alt.atheism', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, categories=categories, remove=remove)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories=categories, remove=remove)
Vectorize the training set
We obtain the document-term matrix.
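As a quick toy illustration of what such a matrix looks like (a hypothetical two-document corpus, separate from the newsgroups data):

# Toy sketch: rows = documents, columns = vocabulary terms, values = term counts
toy = CountVectorizer()
print(toy.fit_transform(['the cat sat', 'the cat and the dog']).toarray())
print(toy.get_feature_names_out())  # ['and' 'cat' 'dog' 'sat' 'the']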
vectorizer = CountVectorizer(stop_words='english')
train_data = vectorizer.fit_transform(twenty_train['data'])
CV_data = pd.DataFrame(train_data.toarray(), columns=vectorizer.get_feature_names_out())
print(CV_data.shape)
CV_data.head()
(1657, 23297)
|   | 00 | 000 | 0000 | 00000 | 000000 | 000005102000 | 000062david42 | 000100255pixel | 00041032 | 0004136 | ... | zurbrin | zurich | zus | zvi | zwaartepunten | zwak | zwakke | zware | zwarte | zyxel |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 23297 columns
# Build a list of the words present in the vocabulary.
words_vocab = CV_data.columns
print(words_vocab[0:10])
Index(['00', '000', '0000', '00000', '000000', '000005102000', '000062david42',
'000100255pixel', '00041032', '0004136'],
dtype='object')
Vectorizing with GloVe
For each document, we sum the GloVe vectors of the words it consists of; the resulting document vector is the sum of its word vectors.
Let's look at an example of how this vectorization works.
text_data = ['Hello world I love python', 'This is a great computer game! 00 000 zyxel']
# Vectorize with the fitted CountVectorizer
X = vectorizer.transform(text_data)
CV_text_data = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
CV_text_data
|   | 00 | 000 | 0000 | 00000 | 000000 | 000005102000 | 000062david42 | 000100255pixel | 00041032 | 0004136 | ... | zurbrin | zurich | zus | zvi | zwaartepunten | zwak | zwakke | zware | zwarte | zyxel |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 rows × 23297 columns
# Create a dataframe to store the document vectors
glove_data = pd.DataFrame()
# Iterate over the rows (one per document)
for i in range(CV_text_data.shape[0]):
    # Vector for one document, with the dimensionality of the GloVe model:
    one_doc = np.zeros(glove_model.vector_size)
    # Find which words of the document are present in our vocabulary
    # and sum the GloVe vectors of every known word into one_doc
    for word in words_vocab[CV_text_data.iloc[i, :] >= 1]:
        if word in glove_model.key_to_index:
            print(word, ': ', glove_model[word])
            one_doc += glove_model[word]
    print(text_data[i], ': ', one_doc)
    # DataFrame.append is deprecated in pandas; use pd.concat instead
    glove_data = pd.concat([glove_data, pd.DataFrame([one_doc])], axis=0)
print('glove_data: ', glove_data)
hello : [-0.77069 0.12827 0.33137 0.0050893 -0.47605 -0.50116
1.858 1.0624 -0.56511 0.13328 -0.41918 -0.14195
-2.8555 -0.57131 -0.13418 -0.44922 0.48591 -0.6479
-0.84238 0.61669 -0.19824 -0.57967 -0.65885 0.43928
-0.50473 ]
love : [-0.62645 -0.082389 0.070538 0.5782 -0.87199 -0.14816 2.2315
0.98573 -1.3154 -0.34921 -0.8847 0.14585 -4.97 -0.73369
-0.94359 0.035859 -0.026733 -0.77538 -0.30014 0.48853 -0.16678
-0.016651 -0.53164 0.64236 -0.10922 ]
python : [-0.25645 -0.22323 0.025901 0.22901 0.49028 -0.060829 0.24563
-0.84854 1.5882 -0.7274 0.60603 0.25205 -1.8064 -0.95526
0.44867 0.013614 0.60856 0.65423 0.82506 0.99459 -0.29403
-0.27013 -0.348 -0.7293 0.2201 ]
world : [ 0.10301 0.095666 -0.14789 -0.22383 -0.14775 -0.11599 1.8513
0.24886 -0.41877 -0.20384 -0.08509 0.33246 -4.6946 0.84096
-0.46666 -0.031128 -0.19539 -0.037349 0.58949 0.13941 -0.57667
-0.44426 -0.43085 -0.52875 0.25855 ]
Hello world I love python : [ -1.55058002 -0.081683 0.27991899 0.58846928 -1.00551002
-0.82613902 6.18642995 1.44844997 -0.71108004 -1.14717001
-0.78294002 0.58841 -14.32649982 -1.41929996 -1.09575997
-0.430875 0.87234702 -0.806399 0.27203003 2.23921998
-1.23571999 -1.31071102 -1.96934 -0.17641005 -0.1353 ]
computer : [ 0.64005 -0.019514 0.70148 -0.66123 1.1723 -0.58859 0.25917
-0.81541 1.1708 1.1413 -0.15405 -0.11369 -3.8414 -0.87233
0.47489 1.1541 0.97678 1.1107 -0.14572 -0.52013 -0.52234
-0.92349 0.34651 0.061939 -0.57375 ]
game : [ 1.146 0.3291 0.26878 -1.3945 -0.30044 0.77901 1.3537
0.37393 0.50478 -0.44266 -0.048706 0.51396 -4.3136 0.39805
1.197 0.10287 -0.17618 -1.2881 -0.59801 0.26131 -1.2619
0.39202 0.59309 -0.55232 0.005087]
great : [-8.4229e-01 3.6512e-01 -3.8841e-01 -4.6118e-01 2.4301e-01 3.2412e-01
1.9009e+00 -2.2630e-01 -3.1335e-01 -1.0970e+00 -4.1494e-03 6.2074e-01
-5.0964e+00 6.7418e-01 5.0080e-01 -6.2119e-01 5.1765e-01 -4.4122e-01
-1.4364e-01 1.9130e-01 -7.4608e-01 -2.5903e-01 -7.8010e-01 1.1030e-01
-2.7928e-01]
zyxel : [ 0.79234 0.067376 -0.22639 -2.2272 0.30057 -0.85676 -1.7268
-0.78626 1.2042 -0.92348 -0.83987 -0.74233 0.29689 -1.208
0.98706 -1.1624 0.61415 -0.27825 0.27813 1.5838 -0.63593
-0.10225 1.7102 -0.95599 -1.3867 ]
This is a great computer game! 00 000 zyxel : [ 1.73610002 0.74208201 0.35545996 -4.74411008 1.41543998
-0.34222007 1.78697008 -1.45404002 2.56643 -1.32184002
-1.04677537 0.27867999 -12.95450976 -1.00809997 3.15975004
-0.52662008 1.93239999 -0.89686999 -0.60924001 1.51628
-3.16624993 -0.89275002 1.86969995 -1.33607102 -2.23464306]
glove_data: 0 1 2 3 4 5 6 7 \
0 -1.55058 -0.081683 0.279919 0.588469 -1.00551 -0.826139 6.18643 1.44845
0 1.73610 0.742082 0.355460 -4.744110 1.41544 -0.342220 1.78697 -1.45404
8 9 ... 15 16 17 18 19 \
0 -0.71108 -1.14717 ... -0.430875 0.872347 -0.806399 0.27203 2.23922
0 2.56643 -1.32184 ... -0.526620 1.932400 -0.896870 -0.60924 1.51628
20 21 22 23 24
0 -1.23572 -1.310711 -1.96934 -0.176410 -0.135300
0 -3.16625 -0.892750 1.86970 -1.336071 -2.234643
[2 rows x 25 columns]
def text2vec(text_data):
    # Vectorize with the fitted CountVectorizer
    X = vectorizer.transform(text_data)
    CV_text_data = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
    # Create a dataframe to store the document vectors
    glove_data = pd.DataFrame()
    # Iterate over the rows (one per document)
    for i in range(CV_text_data.shape[0]):
        # Vector for one document, with the dimensionality of the GloVe model:
        one_doc = np.zeros(glove_model.vector_size)
        # Find which words of the document are present in our vocabulary
        # and sum the GloVe vectors of every known word into one_doc
        for word in words_vocab[CV_text_data.iloc[i, :] >= 1]:
            if word in glove_model.key_to_index:
                one_doc += glove_model[word]
        glove_data = pd.concat([glove_data, pd.DataFrame([one_doc])], axis=0)
    return glove_data
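Note that text2vec materializes a dense document-term dataframe only to recover each document's word list, which is costly at 1657 × 23297. A leaner sketch of the same idea, assuming lowercasing plus whitespace splitting approximates the vectorizer's tokenization (text2vec_simple is a hypothetical helper, not part of the notebook):

def text2vec_simple(text_data):
    # Sum the GloVe vectors of each document's known words; all-zeros if none are known
    rows = []
    for text in text_data:
        words = [w for w in text.lower().split() if w in glove_model.key_to_index]
        rows.append(np.sum([glove_model[w] for w in words], axis=0) if words
                    else np.zeros(glove_model.vector_size))
    return pd.DataFrame(rows)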
glove_data
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.55058 | -0.081683 | 0.279919 | 0.588469 | -1.00551 | -0.826139 | 6.18643 | 1.44845 | -0.71108 | -1.14717 | ... | -0.430875 | 0.872347 | -0.806399 | 0.27203 | 2.23922 | -1.23572 | -1.310711 | -1.96934 | -0.176410 | -0.135300 |
| 0 | 1.73610 | 0.742082 | 0.355460 | -4.744110 | 1.41544 | -0.342220 | 1.78697 | -1.45404 | 2.56643 | -1.32184 | ... | -0.526620 | 1.932400 | -0.896870 | -0.60924 | 1.51628 | -3.16625 | -0.892750 | 1.86970 | -1.336071 | -2.234643 |
2 rows × 25 columns
one_doc
array([  1.73610002,   0.74208201,   0.35545996,  -4.74411008,
1.41543998, -0.34222007, 1.78697008, -1.45404002,
2.56643 , -1.32184002, -1.04677537, 0.27867999,
-12.95450976, -1.00809997, 3.15975004, -0.52662008,
1.93239999, -0.89686999, -0.60924001, 1.51628 ,
-3.16624993, -0.89275002, 1.86969995, -1.33607102,
-2.23464306])
train_data_glove = text2vec(twenty_train['data'])
train_data_glove
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -8.521142 | 2.020376 | -10.802921 | 3.167636 | 0.252469 | 15.544048 | 17.631184 | -32.581192 | 9.696540 | -11.103087 | ... | 2.810453 | 7.900215 | 0.962129 | 17.691130 | -1.252574 | -10.098049 | 0.500113 | 1.348694 | 2.186150 | -16.556824 |
| 0 | 6.576228 | 20.336350 | -32.675150 | -9.073872 | 17.515655 | -6.488794 | 59.458419 | -75.384298 | 13.323775 | -14.443218 | ... | 21.407738 | 23.525118 | 0.325680 | 19.871444 | -27.585188 | -4.559155 | -7.417482 | -16.694553 | -0.197711 | -58.948193 |
| 0 | 1.329914 | 3.060870 | -1.868484 | 1.392735 | -1.335277 | -5.014955 | 12.859476 | -9.978156 | -0.869613 | -2.031490 | ... | 2.925134 | 2.872930 | 2.184486 | 3.831770 | -0.877866 | -0.927770 | 0.700101 | -9.855365 | -5.419429 | -2.279330 |
| 0 | -4.866150 | -0.273176 | 3.515124 | -5.008165 | -1.236789 | -7.951168 | -11.015882 | -3.496241 | 16.024286 | -9.388742 | ... | -0.471141 | 3.575378 | 6.193222 | 0.349430 | 15.040248 | -10.369132 | -0.848717 | -0.564796 | -1.114126 | -7.844431 |
| 0 | -3.115007 | -1.805252 | -5.419340 | -0.393406 | -0.406461 | -2.724340 | 7.898330 | -15.619113 | 0.231822 | -3.628156 | ... | 5.944151 | 8.309932 | -0.656084 | 12.178709 | -6.118551 | -3.286376 | 3.450946 | 2.055343 | 0.463787 | -12.644626 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 0 | -0.930954 | 4.974043 | -8.147008 | -5.147130 | 3.960455 | -1.344022 | 7.818063 | -25.427420 | 4.624732 | -7.218097 | ... | 3.623038 | 4.453189 | 2.405320 | 8.032963 | -8.029539 | 0.838867 | -4.757457 | -5.755052 | -9.496197 | -21.542710 |
| 0 | -0.770690 | 0.128270 | 0.331370 | 0.005089 | -0.476050 | -0.501160 | 1.858000 | 1.062400 | -0.565110 | 0.133280 | ... | -0.449220 | 0.485910 | -0.647900 | -0.842380 | 0.616690 | -0.198240 | -0.579670 | -0.658850 | 0.439280 | -0.504730 |
| 0 | 1.491177 | 6.992638 | -7.921970 | -7.157521 | 6.641657 | -2.958020 | 12.820770 | -18.502946 | 6.838083 | -2.717310 | ... | -1.344873 | 4.170405 | -0.178030 | 5.699992 | -7.295038 | -3.683306 | -2.718006 | -0.117608 | -7.205832 | -13.863438 |
| 0 | 2.523770 | 5.817394 | 2.184340 | -2.996497 | -0.267181 | -10.059634 | 6.344402 | -2.047127 | 2.679123 | -7.642505 | ... | -1.230296 | 1.409746 | -3.322040 | -5.068259 | -0.648718 | 0.753010 | -6.220990 | -5.012004 | -1.518542 | -10.156440 |
| 0 | -0.118691 | 11.860546 | -2.567264 | -10.955913 | -4.239322 | -9.340552 | 21.189778 | -10.895375 | 2.659030 | -3.848115 | ... | 0.726191 | 11.634998 | -5.447248 | 1.293007 | -7.882002 | -2.527453 | 0.298939 | -6.107062 | 3.365051 | -15.641826 |
1657 rows × 25 columns
train_data
<1657x23297 sparse matrix of type '<class 'numpy.int64'>'
with 106580 stored elements in Compressed Sparse Row format>
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(train_data_glove, twenty_train['target'])
KNeighborsClassifier()
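Since document vectors are sums, longer documents get larger norms, which skews the default Euclidean neighbour search. A variant worth trying (a sketch, not something this notebook measures) is cosine distance:

# Sketch: the same classifier with cosine distance, which ignores vector magnitude
clf_cos = KNeighborsClassifier(n_neighbors=5, metric='cosine')
clf_cos.fit(train_data_glove, twenty_train['target'])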
test_data_glove = text2vec(twenty_test['data'])
test_data_glove
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -6.760635 | 5.063863 | -2.779060 | 3.699120 | -2.858086 | 0.135230 | 20.811229 | -19.425567 | 7.302950 | -5.826012 | ... | 3.833378 | 6.794452 | -0.921720 | 12.187404 | -5.547615 | -4.133999 | 3.588260 | -0.497106 | -2.542142 | -11.362855 |
| 0 | 1.632616 | 2.512300 | -0.745513 | -3.081154 | 2.182067 | -1.988816 | 7.533100 | -1.015740 | -0.829598 | -2.764237 | ... | 0.791851 | 2.114150 | -2.249193 | -0.163590 | -1.177710 | -2.496928 | -5.074085 | -2.666947 | 0.662050 | -3.590550 |
| 0 | 2.115766 | 2.142060 | -0.445607 | -3.229030 | 1.154580 | -2.877278 | 6.399954 | -10.445769 | 2.230760 | -3.299899 | ... | 4.388870 | 8.515056 | -0.766260 | 3.549431 | -1.643443 | -0.825730 | -2.968016 | -0.808924 | -0.000160 | -7.468189 |
| 0 | -0.802784 | 5.199443 | 4.294071 | -7.390966 | 2.747166 | -1.359952 | 15.032628 | -1.601590 | 1.474406 | 2.570105 | ... | 3.043432 | 6.176236 | -6.193988 | -3.990476 | -2.345854 | -5.534376 | -8.925422 | 1.553300 | 0.905790 | -12.824533 |
| 0 | 29.926489 | 65.324993 | -25.059592 | -64.080130 | 77.565282 | -34.614604 | 75.643770 | -115.600859 | 90.847175 | -42.971146 | ... | 40.956031 | 50.322156 | -19.537098 | 28.903925 | -34.643949 | -69.894146 | -94.992145 | -48.601895 | -29.098555 | -91.934770 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 0 | 1.829235 | 4.513807 | 2.916520 | 2.237308 | -1.704831 | -1.811192 | 22.196895 | -12.858912 | -4.054810 | -3.130457 | ... | 6.019246 | 8.949456 | -4.682214 | -5.648911 | -1.026898 | 3.719006 | 2.449941 | -6.487197 | 1.340930 | -7.325196 |
| 0 | -0.963815 | 5.491164 | 3.567377 | -6.048021 | -5.059298 | -0.977958 | 15.131499 | -0.904470 | 2.185990 | -1.459807 | ... | 0.968499 | 4.725793 | -0.726944 | 1.328612 | -3.144209 | 1.643127 | -1.259245 | -0.880740 | -6.713165 | -3.115454 |
| 0 | 6.801324 | 15.348126 | -17.051718 | 5.030998 | 9.332448 | -5.716691 | 56.409175 | -56.250411 | -4.028209 | -11.687558 | ... | 22.884424 | 12.940570 | 1.058664 | 21.879058 | -20.897253 | 2.537755 | 3.774890 | -11.495336 | -2.609774 | -36.597559 |
| 0 | 1.054090 | 0.764524 | 1.958340 | -1.085245 | -0.441392 | -0.421970 | 6.139770 | -0.612219 | -2.251460 | -0.465165 | ... | 0.377958 | 1.957450 | -1.705220 | -0.509700 | 0.016110 | 1.461620 | 1.589069 | 2.267340 | 0.447919 | -0.469250 |
| 0 | -18.387286 | 13.274879 | -7.895913 | -1.831442 | -10.424961 | -12.248442 | 32.153890 | -40.169293 | 13.089525 | -21.306493 | ... | 6.497279 | 8.340729 | 4.996109 | 23.442078 | -3.701088 | -11.671505 | 9.209790 | -10.002501 | -0.815266 | -17.024052 |
1102 rows × 25 columns
predict = clf.predict(test_data_glove)
print(confusion_matrix(twenty_test['target'], predict))
print(classification_report(twenty_test['target'], predict))
[[225  35  59]
[ 26 313 50]
[ 56 98 240]]
precision recall f1-score support
0 0.73 0.71 0.72 319
1 0.70 0.80 0.75 389
2 0.69 0.61 0.65 394
accuracy 0.71 1102
macro avg 0.71 0.71 0.70 1102
weighted avg 0.71 0.71 0.70 1102
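For context, the imports at the top (Pipeline, TfidfTransformer, accuracy_score) suggest a bag-of-words baseline to compare the GloVe pipeline against; a minimal sketch with illustrative, untuned parameters:

# Sketch: TF-IDF bag-of-words baseline with the same kNN classifier, for comparison
baseline = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier(n_neighbors=5)),
])
baseline.fit(twenty_train['data'], twenty_train['target'])
print(accuracy_score(twenty_test['target'], baseline.predict(twenty_test['data'])))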