64 KiB
64 KiB
import gensim.downloader
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
# List the names of the pretrained models that gensim's downloader reports.
# The captured cell output used to be glued onto the statement, which turned
# the line into `print(...)[...]` (subscripting the None returned by print).
print(list(gensim.downloader.info()['models'].keys()))
# Recorded output:
# ['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300',
#  'word2vec-ruscorpora-300', 'word2vec-google-news-300',
#  'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100',
#  'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300',
#  'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100',
#  'glove-twitter-200', '__testing_word2vec-matrix-synopsis']
GloVe
# The print call used to sit inside the trailing comment of the load statement,
# so it never executed; it is now a statement of its own.
glove_model = gensim.downloader.load("glove-twitter-25")  # load glove vectors
print(glove_model['cat'])  # word embedding for 'cat'
glove_model.most_similar("cat") # show words that similar to word 'cat'[-0.96419 -0.60978 0.67449 0.35113 0.41317 -0.21241 1.3796
0.12854 0.31567 0.66325 0.3391 -0.18934 -3.325 -1.1491
-0.4129 0.2195 0.8706 -0.50616 -0.12781 -0.066965 0.065761
0.43927 0.1758 -0.56058 0.13529 ]
[('dog', 0.9590820074081421),
('monkey', 0.920357882976532),
('bear', 0.9143136739730835),
('pet', 0.9108031392097473),
('girl', 0.8880629539489746),
('horse', 0.8872726559638977),
('kitty', 0.8870542049407959),
('puppy', 0.886769711971283),
('hot', 0.886525571346283),
('lady', 0.8845519423484802)]
# Similarity score between two words according to the loaded GloVe model.
# The captured result was fused to the call and has been moved into a comment.
glove_model.similarity('cat', 'bus')  # recorded output: 0.60927683
# Restrict the 20 Newsgroups corpus to three categories and strip the parts of
# each post listed in `remove` before the text is returned.
categories = ['alt.atheism', 'comp.graphics', 'sci.space']
remove = ('headers', 'footers', 'quotes')

# Train and test splits are fetched with identical settings.
twenty_train = fetch_20newsgroups(
    subset='train',
    shuffle=True,
    random_state=42,
    categories=categories,
    remove=remove,
)
twenty_test = fetch_20newsgroups(
    subset='test',
    shuffle=True,
    random_state=42,
    categories=categories,
    remove=remove,
)

# Vectorize the training set
# Build the "document-term" matrix
# Fit a bag-of-words vocabulary on the training documents (English stop words
# removed) and obtain the sparse document-term count matrix.
vectorizer = CountVectorizer(stop_words='english')
train_data = vectorizer.fit_transform(twenty_train['data'])

# Dense view of the counts with one column per vocabulary word, for inspection.
CV_data = pd.DataFrame(train_data.toarray(), columns=vectorizer.get_feature_names_out())
print(CV_data.shape)  # recorded output: (1657, 23297)
CV_data.head()
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| 00 | 000 | 0000 | 00000 | 000000 | 000005102000 | 000062david42 | 000100255pixel | 00041032 | 0004136 | ... | zurbrin | zurich | zus | zvi | zwaartepunten | zwak | zwakke | zware | zwarte | zyxel | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 23297 columns
# Build the list of words present in the vocabulary (the columns of CV_data).
words_vocab = CV_data.columns
print(words_vocab[0:10])
# Recorded output:
# Index(['00', '000', '0000', '00000', '000000', '000005102000', '000062david42',
#        '000100255pixel', '00041032', '0004136'],
#       dtype='object')
Векторизуем с помощью GloVe
Нужно для каждого документа сложить glove-вектора слов, из которых он состоит. В результате получим вектор документа как сумму векторов слов, из него состоящих
Посмотрим на примере как будет работать векторизация
# Two sample documents used to illustrate the GloVe-based vectorization.
text_data = ['Hello world I love python', 'This is a great computer game! 00 000 zyxel']
# Vectorize with the already-fitted CountVectorizer
X = vectorizer.transform(text_data)
# Dense table of the counts with one column per vocabulary word
CV_text_data=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# Bare expression: displays the table when run as a notebook cell
CV_text_data
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| 00 | 000 | 0000 | 00000 | 000000 | 000005102000 | 000062david42 | 000100255pixel | 00041032 | 0004136 | ... | zurbrin | zurich | zus | zvi | zwaartepunten | zwak | zwakke | zware | zwarte | zyxel | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 rows × 23297 columns
# Worked example: build one GloVe document vector per sample document.
# The vectors are collected in a list and turned into a DataFrame once, after
# the loop. This replaces DataFrame.append (flagged as deprecated in this
# notebook's own FutureWarning) and gives the rows a 0..n-1 index instead of
# the label 0 repeated on every row.
doc_vectors = []
# Walk over every row of the document-term table (one row per document)
for i in range(CV_text_data.shape[0]):
    # Accumulator for one document, sized to the GloVe model's dimensionality
    one_doc = np.zeros(glove_model.vector_size)
    # Look at the vocabulary words that occur in this document and add the
    # GloVe vector of every word the GloVe model knows; each word is added
    # once, however many times it occurs in the document.
    for word in words_vocab[CV_text_data.iloc[i, :] >= 1]:
        if word in glove_model.key_to_index:
            print(word, ': ', glove_model[word])
            one_doc += glove_model[word]
    print(text_data[i], ': ', one_doc)
    doc_vectors.append(one_doc)
glove_data = pd.DataFrame(doc_vectors)
print('glove_data: ', glove_data)hello : [-0.77069 0.12827 0.33137 0.0050893 -0.47605 -0.50116
1.858 1.0624 -0.56511 0.13328 -0.41918 -0.14195
-2.8555 -0.57131 -0.13418 -0.44922 0.48591 -0.6479
-0.84238 0.61669 -0.19824 -0.57967 -0.65885 0.43928
-0.50473 ]
love : [-0.62645 -0.082389 0.070538 0.5782 -0.87199 -0.14816 2.2315
0.98573 -1.3154 -0.34921 -0.8847 0.14585 -4.97 -0.73369
-0.94359 0.035859 -0.026733 -0.77538 -0.30014 0.48853 -0.16678
-0.016651 -0.53164 0.64236 -0.10922 ]
python : [-0.25645 -0.22323 0.025901 0.22901 0.49028 -0.060829 0.24563
-0.84854 1.5882 -0.7274 0.60603 0.25205 -1.8064 -0.95526
0.44867 0.013614 0.60856 0.65423 0.82506 0.99459 -0.29403
-0.27013 -0.348 -0.7293 0.2201 ]
world : [ 0.10301 0.095666 -0.14789 -0.22383 -0.14775 -0.11599 1.8513
0.24886 -0.41877 -0.20384 -0.08509 0.33246 -4.6946 0.84096
-0.46666 -0.031128 -0.19539 -0.037349 0.58949 0.13941 -0.57667
-0.44426 -0.43085 -0.52875 0.25855 ]
Hello world I love python : [ -1.55058002 -0.081683 0.27991899 0.58846928 -1.00551002
-0.82613902 6.18642995 1.44844997 -0.71108004 -1.14717001
-0.78294002 0.58841 -14.32649982 -1.41929996 -1.09575997
-0.430875 0.87234702 -0.806399 0.27203003 2.23921998
-1.23571999 -1.31071102 -1.96934 -0.17641005 -0.1353 ]
computer : [ 0.64005 -0.019514 0.70148 -0.66123 1.1723 -0.58859 0.25917
-0.81541 1.1708 1.1413 -0.15405 -0.11369 -3.8414 -0.87233
0.47489 1.1541 0.97678 1.1107 -0.14572 -0.52013 -0.52234
-0.92349 0.34651 0.061939 -0.57375 ]
game : [ 1.146 0.3291 0.26878 -1.3945 -0.30044 0.77901 1.3537
0.37393 0.50478 -0.44266 -0.048706 0.51396 -4.3136 0.39805
1.197 0.10287 -0.17618 -1.2881 -0.59801 0.26131 -1.2619
0.39202 0.59309 -0.55232 0.005087]
great : [-8.4229e-01 3.6512e-01 -3.8841e-01 -4.6118e-01 2.4301e-01 3.2412e-01
1.9009e+00 -2.2630e-01 -3.1335e-01 -1.0970e+00 -4.1494e-03 6.2074e-01
-5.0964e+00 6.7418e-01 5.0080e-01 -6.2119e-01 5.1765e-01 -4.4122e-01
-1.4364e-01 1.9130e-01 -7.4608e-01 -2.5903e-01 -7.8010e-01 1.1030e-01
-2.7928e-01]
zyxel : [ 0.79234 0.067376 -0.22639 -2.2272 0.30057 -0.85676 -1.7268
-0.78626 1.2042 -0.92348 -0.83987 -0.74233 0.29689 -1.208
0.98706 -1.1624 0.61415 -0.27825 0.27813 1.5838 -0.63593
-0.10225 1.7102 -0.95599 -1.3867 ]
This is a great computer game! 00 000 zyxel : [ 1.73610002 0.74208201 0.35545996 -4.74411008 1.41543998
-0.34222007 1.78697008 -1.45404002 2.56643 -1.32184002
-1.04677537 0.27867999 -12.95450976 -1.00809997 3.15975004
-0.52662008 1.93239999 -0.89686999 -0.60924001 1.51628
-3.16624993 -0.89275002 1.86969995 -1.33607102 -2.23464306]
glove_data: 0 1 2 3 4 5 6 7 \
0 -1.55058 -0.081683 0.279919 0.588469 -1.00551 -0.826139 6.18643 1.44845
0 1.73610 0.742082 0.355460 -4.744110 1.41544 -0.342220 1.78697 -1.45404
8 9 ... 15 16 17 18 19 \
0 -0.71108 -1.14717 ... -0.430875 0.872347 -0.806399 0.27203 2.23922
0 2.56643 -1.32184 ... -0.526620 1.932400 -0.896870 -0.60924 1.51628
20 21 22 23 24
0 -1.23572 -1.310711 -1.96934 -0.176410 -0.135300
0 -3.16625 -0.892750 1.86970 -1.336071 -2.234643
[2 rows x 25 columns]
C:\Users\Андрей\AppData\Local\Temp\ipykernel_29476\129113310.py:17: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
glove_data=glove_data.append(pd.DataFrame([one_doc]))
def text2vec(text_data, *, count_vectorizer=None, embeddings=None):
    """Convert raw documents into document vectors by summing word embeddings.

    A document's vector is the sum of the embedding vectors of the words that
    (a) the count vectorizer reports as features present in the document and
    (b) exist as keys of the embedding model. Each such word contributes once,
    regardless of how many times it occurs in the document. A document with no
    such word gets an all-zero vector.

    Parameters
    ----------
    text_data : iterable of str
        Documents to vectorize.
    count_vectorizer : optional
        Fitted object providing ``transform(docs)`` (sparse document-term
        matrix) and ``get_feature_names_out()``. Defaults to the module-level
        ``vectorizer``.
    embeddings : optional
        Object providing ``key_to_index``, ``vector_size`` and ``obj[word]``
        lookup. Defaults to the module-level ``glove_model``.

    Returns
    -------
    pandas.DataFrame
        One row per document (index 0..n-1) and one column per embedding
        dimension (columns 0..vector_size-1).
    """
    if count_vectorizer is None:
        count_vectorizer = vectorizer
    if embeddings is None:
        embeddings = glove_model

    # Keep the document-term matrix sparse: densifying it allocates
    # n_documents x vocabulary_size cells just to find the non-zero ones.
    counts = count_vectorizer.transform(text_data)
    vocab = count_vectorizer.get_feature_names_out()

    # Row j holds the embedding of vocabulary word j, or zeros when the
    # embedding model does not know the word (so it adds nothing to the sum).
    # The width comes from the model instead of a hard-coded 25.
    word_vectors = np.zeros((len(vocab), embeddings.vector_size))
    known_words = embeddings.key_to_index
    for row, word in enumerate(vocab):
        if word in known_words:
            word_vectors[row] = embeddings[word]

    # 0/1 presence indicator: counts are non-negative, so "> 0" selects the
    # same words as the previous ">= 1" test on the dense table.
    presence = (counts > 0).astype(word_vectors.dtype)

    # A single sparse-by-dense product sums the vectors of the present words
    # for all documents at once and yields a clean 0..n-1 row index.
    return pd.DataFrame(np.asarray(presence @ word_vectors))
glove_data
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.55058 | -0.081683 | 0.279919 | 0.588469 | -1.00551 | -0.826139 | 6.18643 | 1.44845 | -0.71108 | -1.14717 | ... | -0.430875 | 0.872347 | -0.806399 | 0.27203 | 2.23922 | -1.23572 | -1.310711 | -1.96934 | -0.176410 | -0.135300 |
| 0 | 1.73610 | 0.742082 | 0.355460 | -4.744110 | 1.41544 | -0.342220 | 1.78697 | -1.45404 | 2.56643 | -1.32184 | ... | -0.526620 | 1.932400 | -0.896870 | -0.60924 | 1.51628 | -3.16625 | -0.892750 | 1.86970 | -1.336071 | -2.234643 |
2 rows × 25 columns
one_docarray([ 1.73610002, 0.74208201, 0.35545996, -4.74411008,
1.41543998, -0.34222007, 1.78697008, -1.45404002,
2.56643 , -1.32184002, -1.04677537, 0.27867999,
-12.95450976, -1.00809997, 3.15975004, -0.52662008,
1.93239999, -0.89686999, -0.60924001, 1.51628 ,
-3.16624993, -0.89275002, 1.86969995, -1.33607102,
-2.23464306])
# Document vectors for the training split, produced by text2vec
train_data_glove = text2vec(twenty_train['data'])
# Bare expression: displays the table when run as a notebook cell
train_data_glove
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -8.521142 | 2.020376 | -10.802921 | 3.167636 | 0.252469 | 15.544048 | 17.631184 | -32.581192 | 9.696540 | -11.103087 | ... | 2.810453 | 7.900215 | 0.962129 | 17.691130 | -1.252574 | -10.098049 | 0.500113 | 1.348694 | 2.186150 | -16.556824 |
| 0 | 6.576228 | 20.336350 | -32.675150 | -9.073872 | 17.515655 | -6.488794 | 59.458419 | -75.384298 | 13.323775 | -14.443218 | ... | 21.407738 | 23.525118 | 0.325680 | 19.871444 | -27.585188 | -4.559155 | -7.417482 | -16.694553 | -0.197711 | -58.948193 |
| 0 | 1.329914 | 3.060870 | -1.868484 | 1.392735 | -1.335277 | -5.014955 | 12.859476 | -9.978156 | -0.869613 | -2.031490 | ... | 2.925134 | 2.872930 | 2.184486 | 3.831770 | -0.877866 | -0.927770 | 0.700101 | -9.855365 | -5.419429 | -2.279330 |
| 0 | -4.866150 | -0.273176 | 3.515124 | -5.008165 | -1.236789 | -7.951168 | -11.015882 | -3.496241 | 16.024286 | -9.388742 | ... | -0.471141 | 3.575378 | 6.193222 | 0.349430 | 15.040248 | -10.369132 | -0.848717 | -0.564796 | -1.114126 | -7.844431 |
| 0 | -3.115007 | -1.805252 | -5.419340 | -0.393406 | -0.406461 | -2.724340 | 7.898330 | -15.619113 | 0.231822 | -3.628156 | ... | 5.944151 | 8.309932 | -0.656084 | 12.178709 | -6.118551 | -3.286376 | 3.450946 | 2.055343 | 0.463787 | -12.644626 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 0 | -0.930954 | 4.974043 | -8.147008 | -5.147130 | 3.960455 | -1.344022 | 7.818063 | -25.427420 | 4.624732 | -7.218097 | ... | 3.623038 | 4.453189 | 2.405320 | 8.032963 | -8.029539 | 0.838867 | -4.757457 | -5.755052 | -9.496197 | -21.542710 |
| 0 | -0.770690 | 0.128270 | 0.331370 | 0.005089 | -0.476050 | -0.501160 | 1.858000 | 1.062400 | -0.565110 | 0.133280 | ... | -0.449220 | 0.485910 | -0.647900 | -0.842380 | 0.616690 | -0.198240 | -0.579670 | -0.658850 | 0.439280 | -0.504730 |
| 0 | 1.491177 | 6.992638 | -7.921970 | -7.157521 | 6.641657 | -2.958020 | 12.820770 | -18.502946 | 6.838083 | -2.717310 | ... | -1.344873 | 4.170405 | -0.178030 | 5.699992 | -7.295038 | -3.683306 | -2.718006 | -0.117608 | -7.205832 | -13.863438 |
| 0 | 2.523770 | 5.817394 | 2.184340 | -2.996497 | -0.267181 | -10.059634 | 6.344402 | -2.047127 | 2.679123 | -7.642505 | ... | -1.230296 | 1.409746 | -3.322040 | -5.068259 | -0.648718 | 0.753010 | -6.220990 | -5.012004 | -1.518542 | -10.156440 |
| 0 | -0.118691 | 11.860546 | -2.567264 | -10.955913 | -4.239322 | -9.340552 | 21.189778 | -10.895375 | 2.659030 | -3.848115 | ... | 0.726191 | 11.634998 | -5.447248 | 1.293007 | -7.882002 | -2.527453 | 0.298939 | -6.107062 | 3.365051 | -15.641826 |
1657 rows × 25 columns
train_data<1657x23297 sparse matrix of type '<class 'numpy.int64'>'
with 106580 stored elements in Compressed Sparse Row format>
# k-nearest-neighbours classifier (k = 5) trained on the GloVe document vectors.
# Construction and fitting used to be fused into one unparsable line.
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(train_data_glove, twenty_train['target'])
<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: "▸";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: "▾";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: "";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: "";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: "";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style>
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier()
# Document vectors for the test split, produced by text2vec
test_data_glove = text2vec(twenty_test['data'])
# Bare expression: displays the table when run as a notebook cell
test_data_glove
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -6.760635 | 5.063863 | -2.779060 | 3.699120 | -2.858086 | 0.135230 | 20.811229 | -19.425567 | 7.302950 | -5.826012 | ... | 3.833378 | 6.794452 | -0.921720 | 12.187404 | -5.547615 | -4.133999 | 3.588260 | -0.497106 | -2.542142 | -11.362855 |
| 0 | 1.632616 | 2.512300 | -0.745513 | -3.081154 | 2.182067 | -1.988816 | 7.533100 | -1.015740 | -0.829598 | -2.764237 | ... | 0.791851 | 2.114150 | -2.249193 | -0.163590 | -1.177710 | -2.496928 | -5.074085 | -2.666947 | 0.662050 | -3.590550 |
| 0 | 2.115766 | 2.142060 | -0.445607 | -3.229030 | 1.154580 | -2.877278 | 6.399954 | -10.445769 | 2.230760 | -3.299899 | ... | 4.388870 | 8.515056 | -0.766260 | 3.549431 | -1.643443 | -0.825730 | -2.968016 | -0.808924 | -0.000160 | -7.468189 |
| 0 | -0.802784 | 5.199443 | 4.294071 | -7.390966 | 2.747166 | -1.359952 | 15.032628 | -1.601590 | 1.474406 | 2.570105 | ... | 3.043432 | 6.176236 | -6.193988 | -3.990476 | -2.345854 | -5.534376 | -8.925422 | 1.553300 | 0.905790 | -12.824533 |
| 0 | 29.926489 | 65.324993 | -25.059592 | -64.080130 | 77.565282 | -34.614604 | 75.643770 | -115.600859 | 90.847175 | -42.971146 | ... | 40.956031 | 50.322156 | -19.537098 | 28.903925 | -34.643949 | -69.894146 | -94.992145 | -48.601895 | -29.098555 | -91.934770 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 0 | 1.829235 | 4.513807 | 2.916520 | 2.237308 | -1.704831 | -1.811192 | 22.196895 | -12.858912 | -4.054810 | -3.130457 | ... | 6.019246 | 8.949456 | -4.682214 | -5.648911 | -1.026898 | 3.719006 | 2.449941 | -6.487197 | 1.340930 | -7.325196 |
| 0 | -0.963815 | 5.491164 | 3.567377 | -6.048021 | -5.059298 | -0.977958 | 15.131499 | -0.904470 | 2.185990 | -1.459807 | ... | 0.968499 | 4.725793 | -0.726944 | 1.328612 | -3.144209 | 1.643127 | -1.259245 | -0.880740 | -6.713165 | -3.115454 |
| 0 | 6.801324 | 15.348126 | -17.051718 | 5.030998 | 9.332448 | -5.716691 | 56.409175 | -56.250411 | -4.028209 | -11.687558 | ... | 22.884424 | 12.940570 | 1.058664 | 21.879058 | -20.897253 | 2.537755 | 3.774890 | -11.495336 | -2.609774 | -36.597559 |
| 0 | 1.054090 | 0.764524 | 1.958340 | -1.085245 | -0.441392 | -0.421970 | 6.139770 | -0.612219 | -2.251460 | -0.465165 | ... | 0.377958 | 1.957450 | -1.705220 | -0.509700 | 0.016110 | 1.461620 | 1.589069 | 2.267340 | 0.447919 | -0.469250 |
| 0 | -18.387286 | 13.274879 | -7.895913 | -1.831442 | -10.424961 | -12.248442 | 32.153890 | -40.169293 | 13.089525 | -21.306493 | ... | 6.497279 | 8.340729 | 4.996109 | 23.442078 | -3.701088 | -11.671505 | 9.209790 | -10.002501 | -0.815266 | -17.024052 |
1102 rows × 25 columns
# Classify the test documents and report quality against the true labels.
# The statements used to be fused with each other and with the printed report.
predict = clf.predict(test_data_glove)
print(confusion_matrix(twenty_test['target'], predict))
print(classification_report(twenty_test['target'], predict))
# Recorded output:
# [[225  35  59]
#  [ 26 313  50]
#  [ 56  98 240]]
#               precision  recall  f1-score  support
#            0       0.73    0.71      0.72      319
#            1       0.70    0.80      0.75      389
#            2       0.69    0.61      0.65      394
#     accuracy                         0.71     1102
#    macro avg       0.71    0.71      0.70     1102
# weighted avg       0.71    0.71      0.70     1102