63 KiB
63 KiB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from sklearn.pipeline import Pipeline
import gensim.downloader
print(list(gensim.downloader.info()['models'].keys()))
['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']
GloVe
glove_model = gensim.downloader.load("glove-twitter-25") # load glove vectors
print(glove_model['cat']) # word embedding for 'cat'
glove_model.most_similar("cat") # show the words most similar to 'cat'
[-0.96419 -0.60978 0.67449 0.35113 0.41317 -0.21241 1.3796
0.12854 0.31567 0.66325 0.3391 -0.18934 -3.325 -1.1491
-0.4129 0.2195 0.8706 -0.50616 -0.12781 -0.066965 0.065761
0.43927 0.1758 -0.56058 0.13529 ]
[('dog', 0.9590820074081421),
('monkey', 0.920357882976532),
('bear', 0.9143136739730835),
('pet', 0.9108031392097473),
('girl', 0.8880629539489746),
('horse', 0.8872726559638977),
('kitty', 0.8870542049407959),
('puppy', 0.886769711971283),
('hot', 0.886525571346283),
('lady', 0.8845519423484802)]
glove_model.similarity('cat', 'bus')
0.60927683
# Limit the corpus to three newsgroups.
categories = ['alt.atheism', 'comp.graphics', 'sci.space']
# Parts of every post that the loader is asked to strip (passed as `remove`).
remove = ('headers', 'footers', 'quotes')
# Training split, shuffled with a fixed seed (random_state=42).
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, categories = categories, remove = remove )
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, categories = categories, remove = remove )
Векторизуем обучающую выборку
Получаем матрицу "Документ-термин"
vectorizer = CountVectorizer(stop_words='english')
train_data = vectorizer.fit_transform(twenty_train['data'])
CV_data=pd.DataFrame(train_data.toarray(), columns=vectorizer.get_feature_names_out())
print(CV_data.shape)
CV_data.head()
(1657, 23297)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| 00 | 000 | 0000 | 00000 | 000000 | 000005102000 | 000062david42 | 000100255pixel | 00041032 | 0004136 | ... | zurbrin | zurich | zus | zvi | zwaartepunten | zwak | zwakke | zware | zwarte | zyxel | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
5 rows × 23297 columns
# Build the list of words present in the vocabulary
# (the column labels of the document-term DataFrame CV_data).
words_vocab=CV_data.columns
print(words_vocab[0:10])
Index(['00', '000', '0000', '00000', '000000', '000005102000', '000062david42',
'000100255pixel', '00041032', '0004136'],
dtype='object')
Векторизуем с помощью GloVe
Нужно для каждого документа сложить GloVe-векторы слов, из которых он состоит. В результате получим вектор документа как сумму векторов составляющих его слов.
Посмотрим на примере, как будет работать векторизация.
# Suppose the sample consists of two documents:
text_data = ['Hello world I love python', 'This is a great computer game! 00 000 zyxel']
# Vectorize them with the already-fitted CountVectorizer
X = vectorizer.transform(text_data)
# Dense document-term table whose columns are the vectorizer's feature names
CV_text_data=pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
CV_text_data
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| 00 | 000 | 0000 | 00000 | 000000 | 000005102000 | 000062david42 | 000100255pixel | 00041032 | 0004136 | ... | zurbrin | zurich | zus | zvi | zwaartepunten | zwak | zwakke | zware | zwarte | zyxel | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 rows × 23297 columns
# Demo: build one GloVe-based vector per document of CV_text_data.
# NOTE(review): indentation was lost in this export; presumably the word
# print and the `+=` sit inside the `if`, and the final print/append inside
# the outer loop -- inferred from the recorded output, confirm against the notebook.
# Create the DataFrame in which the document vectors will be stored
glove_data=pd.DataFrame()
# Iterate over every row of the DataFrame (i.e. over every document)
for i in range(CV_text_data.shape[0]):
# Vector of a single document, with the dimensionality of the GloVe model:
one_doc = np.zeros(25)
# Go through the document and see which of its words are present in our vocabulary
# (columns of row i with a count >= 1).
# Sum the GloVe vectors of every known word into one_doc
for word in words_vocab[CV_text_data.iloc[i,:] >= 1]:
if word in glove_model.key_to_index.keys():
print(word, ': ', glove_model[word])
one_doc += glove_model[word]
print(text_data[i], ': ', one_doc)
# NOTE: DataFrame.append is deprecated; the FutureWarning recorded in this
# cell's output recommends pandas.concat instead.
glove_data=glove_data.append(pd.DataFrame([one_doc])) hello : [-0.77069 0.12827 0.33137 0.0050893 -0.47605 -0.50116
1.858 1.0624 -0.56511 0.13328 -0.41918 -0.14195
-2.8555 -0.57131 -0.13418 -0.44922 0.48591 -0.6479
-0.84238 0.61669 -0.19824 -0.57967 -0.65885 0.43928
-0.50473 ]
love : [-0.62645 -0.082389 0.070538 0.5782 -0.87199 -0.14816 2.2315
0.98573 -1.3154 -0.34921 -0.8847 0.14585 -4.97 -0.73369
-0.94359 0.035859 -0.026733 -0.77538 -0.30014 0.48853 -0.16678
-0.016651 -0.53164 0.64236 -0.10922 ]
python : [-0.25645 -0.22323 0.025901 0.22901 0.49028 -0.060829 0.24563
-0.84854 1.5882 -0.7274 0.60603 0.25205 -1.8064 -0.95526
0.44867 0.013614 0.60856 0.65423 0.82506 0.99459 -0.29403
-0.27013 -0.348 -0.7293 0.2201 ]
world : [ 0.10301 0.095666 -0.14789 -0.22383 -0.14775 -0.11599 1.8513
0.24886 -0.41877 -0.20384 -0.08509 0.33246 -4.6946 0.84096
-0.46666 -0.031128 -0.19539 -0.037349 0.58949 0.13941 -0.57667
-0.44426 -0.43085 -0.52875 0.25855 ]
Hello world I love python : [ -1.55058002 -0.081683 0.27991899 0.58846928 -1.00551002
-0.82613902 6.18642995 1.44844997 -0.71108004 -1.14717001
-0.78294002 0.58841 -14.32649982 -1.41929996 -1.09575997
-0.430875 0.87234702 -0.806399 0.27203003 2.23921998
-1.23571999 -1.31071102 -1.96934 -0.17641005 -0.1353 ]
computer : [ 0.64005 -0.019514 0.70148 -0.66123 1.1723 -0.58859 0.25917
-0.81541 1.1708 1.1413 -0.15405 -0.11369 -3.8414 -0.87233
0.47489 1.1541 0.97678 1.1107 -0.14572 -0.52013 -0.52234
-0.92349 0.34651 0.061939 -0.57375 ]
game : [ 1.146 0.3291 0.26878 -1.3945 -0.30044 0.77901 1.3537
0.37393 0.50478 -0.44266 -0.048706 0.51396 -4.3136 0.39805
1.197 0.10287 -0.17618 -1.2881 -0.59801 0.26131 -1.2619
0.39202 0.59309 -0.55232 0.005087]
great : [-8.4229e-01 3.6512e-01 -3.8841e-01 -4.6118e-01 2.4301e-01 3.2412e-01
1.9009e+00 -2.2630e-01 -3.1335e-01 -1.0970e+00 -4.1494e-03 6.2074e-01
-5.0964e+00 6.7418e-01 5.0080e-01 -6.2119e-01 5.1765e-01 -4.4122e-01
-1.4364e-01 1.9130e-01 -7.4608e-01 -2.5903e-01 -7.8010e-01 1.1030e-01
-2.7928e-01]
zyxel : [ 0.79234 0.067376 -0.22639 -2.2272 0.30057 -0.85676 -1.7268
-0.78626 1.2042 -0.92348 -0.83987 -0.74233 0.29689 -1.208
0.98706 -1.1624 0.61415 -0.27825 0.27813 1.5838 -0.63593
-0.10225 1.7102 -0.95599 -1.3867 ]
This is a great computer game! 00 000 zyxel : [ 1.73610002 0.74208201 0.35545996 -4.74411008 1.41543998
-0.34222007 1.78697008 -1.45404002 2.56643 -1.32184002
-1.04677537 0.27867999 -12.95450976 -1.00809997 3.15975004
-0.52662008 1.93239999 -0.89686999 -0.60924001 1.51628
-3.16624993 -0.89275002 1.86969995 -1.33607102 -2.23464306]
C:\Users\Андрей\AppData\Local\Temp\ipykernel_8524\2010506005.py:17: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
glove_data=glove_data.append(pd.DataFrame([one_doc]))
def text2vec(text_data):
    """Embed each document as the sum of the GloVe vectors of its known words.

    Documents are tokenised with the already-fitted module-level
    ``vectorizer``. Every vocabulary term that occurs in a document at least
    once and is also present in ``glove_model`` contributes its embedding
    exactly once (repeated occurrences are not weighted). Terms unknown to
    either model are ignored, so a document without any known word maps to
    the zero vector.

    Args:
        text_data: Iterable of raw document strings.

    Returns:
        pandas.DataFrame with one row per document (in input order) and one
        column per embedding dimension. Rows are labelled 0..n-1.
    """
    # Sparse document-term counts. They are deliberately never densified:
    # a dense (n_docs, vocab_size) table is the dominant memory cost.
    counts = vectorizer.transform(text_data)
    vocab = vectorizer.get_feature_names_out()

    # Embedding lookup table aligned with the vectorizer's columns. Rows of
    # terms GloVe does not know stay zero and therefore add nothing to a sum.
    # The width is read from the model instead of being hard-coded to 25.
    known = glove_model.key_to_index
    embeddings = np.zeros((len(vocab), glove_model.vector_size))
    for column, word in enumerate(vocab):
        if word in known:
            embeddings[column] = glove_model[word]

    # Presence (count >= 1), not frequency: each distinct word counts once.
    presence = (counts >= 1).astype(np.float64)

    # One sparse-by-dense product replaces the per-document Python loop and
    # the repeated (quadratic) DataFrame concatenation.
    doc_vectors = presence @ embeddings

    return pd.DataFrame(np.asarray(doc_vectors))
glove_data
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -1.55058 | -0.081683 | 0.279919 | 0.588469 | -1.00551 | -0.826139 | 6.18643 | 1.44845 | -0.71108 | -1.14717 | ... | -0.430875 | 0.872347 | -0.806399 | 0.27203 | 2.23922 | -1.23572 | -1.310711 | -1.96934 | -0.176410 | -0.135300 |
| 0 | 1.73610 | 0.742082 | 0.355460 | -4.744110 | 1.41544 | -0.342220 | 1.78697 | -1.45404 | 2.56643 | -1.32184 | ... | -0.526620 | 1.932400 | -0.896870 | -0.60924 | 1.51628 | -3.16625 | -0.892750 | 1.86970 | -1.336071 | -2.234643 |
2 rows × 25 columns
train_data_glove = text2vec(twenty_train['data'])
train_data_glove
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -8.521142 | 2.020376 | -10.802921 | 3.167636 | 0.252469 | 15.544048 | 17.631184 | -32.581192 | 9.696540 | -11.103087 | ... | 2.810453 | 7.900215 | 0.962129 | 17.691130 | -1.252574 | -10.098049 | 0.500113 | 1.348694 | 2.186150 | -16.556824 |
| 0 | 6.576228 | 20.336350 | -32.675150 | -9.073872 | 17.515655 | -6.488794 | 59.458419 | -75.384298 | 13.323775 | -14.443218 | ... | 21.407738 | 23.525118 | 0.325680 | 19.871444 | -27.585188 | -4.559155 | -7.417482 | -16.694553 | -0.197711 | -58.948193 |
| 0 | 1.329914 | 3.060870 | -1.868484 | 1.392735 | -1.335277 | -5.014955 | 12.859476 | -9.978156 | -0.869613 | -2.031490 | ... | 2.925134 | 2.872930 | 2.184486 | 3.831770 | -0.877866 | -0.927770 | 0.700101 | -9.855365 | -5.419429 | -2.279330 |
| 0 | -4.866150 | -0.273176 | 3.515124 | -5.008165 | -1.236789 | -7.951168 | -11.015882 | -3.496241 | 16.024286 | -9.388742 | ... | -0.471141 | 3.575378 | 6.193222 | 0.349430 | 15.040248 | -10.369132 | -0.848717 | -0.564796 | -1.114126 | -7.844431 |
| 0 | -3.115007 | -1.805252 | -5.419340 | -0.393406 | -0.406461 | -2.724340 | 7.898330 | -15.619113 | 0.231822 | -3.628156 | ... | 5.944151 | 8.309932 | -0.656084 | 12.178709 | -6.118551 | -3.286376 | 3.450946 | 2.055343 | 0.463787 | -12.644626 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 0 | -0.930954 | 4.974043 | -8.147008 | -5.147130 | 3.960455 | -1.344022 | 7.818063 | -25.427420 | 4.624732 | -7.218097 | ... | 3.623038 | 4.453189 | 2.405320 | 8.032963 | -8.029539 | 0.838867 | -4.757457 | -5.755052 | -9.496197 | -21.542710 |
| 0 | -0.770690 | 0.128270 | 0.331370 | 0.005089 | -0.476050 | -0.501160 | 1.858000 | 1.062400 | -0.565110 | 0.133280 | ... | -0.449220 | 0.485910 | -0.647900 | -0.842380 | 0.616690 | -0.198240 | -0.579670 | -0.658850 | 0.439280 | -0.504730 |
| 0 | 1.491177 | 6.992638 | -7.921970 | -7.157521 | 6.641657 | -2.958020 | 12.820770 | -18.502946 | 6.838083 | -2.717310 | ... | -1.344873 | 4.170405 | -0.178030 | 5.699992 | -7.295038 | -3.683306 | -2.718006 | -0.117608 | -7.205832 | -13.863438 |
| 0 | 2.523770 | 5.817394 | 2.184340 | -2.996497 | -0.267181 | -10.059634 | 6.344402 | -2.047127 | 2.679123 | -7.642505 | ... | -1.230296 | 1.409746 | -3.322040 | -5.068259 | -0.648718 | 0.753010 | -6.220990 | -5.012004 | -1.518542 | -10.156440 |
| 0 | -0.118691 | 11.860546 | -2.567264 | -10.955913 | -4.239322 | -9.340552 | 21.189778 | -10.895375 | 2.659030 | -3.848115 | ... | 0.726191 | 11.634998 | -5.447248 | 1.293007 | -7.882002 | -2.527453 | 0.298939 | -6.107062 | 3.365051 | -15.641826 |
1657 rows × 25 columns
clf = KNeighborsClassifier(n_neighbors = 5)
clf.fit(train_data_glove, twenty_train['target'])
<style>#sk-container-id-1 {color: black;background-color: white;}#sk-container-id-1 pre{padding: 0;}#sk-container-id-1 div.sk-toggleable {background-color: white;}#sk-container-id-1 label.sk-toggleable__label {cursor: pointer;display: block;width: 100%;margin-bottom: 0;padding: 0.3em;box-sizing: border-box;text-align: center;}#sk-container-id-1 label.sk-toggleable__label-arrow:before {content: "▸";float: left;margin-right: 0.25em;color: #696969;}#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {color: black;}#sk-container-id-1 div.sk-estimator:hover label.sk-toggleable__label-arrow:before {color: black;}#sk-container-id-1 div.sk-toggleable__content {max-height: 0;max-width: 0;overflow: hidden;text-align: left;background-color: #f0f8ff;}#sk-container-id-1 div.sk-toggleable__content pre {margin: 0.2em;color: black;border-radius: 0.25em;background-color: #f0f8ff;}#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {max-height: 200px;max-width: 100%;overflow: auto;}#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {content: "▾";}#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 input.sk-hidden--visually {border: 0;clip: rect(1px 1px 1px 1px);clip: rect(1px, 1px, 1px, 1px);height: 1px;margin: -1px;overflow: hidden;padding: 0;position: absolute;width: 1px;}#sk-container-id-1 div.sk-estimator {font-family: monospace;background-color: #f0f8ff;border: 1px dotted black;border-radius: 0.25em;box-sizing: border-box;margin-bottom: 0.5em;}#sk-container-id-1 div.sk-estimator:hover {background-color: #d4ebff;}#sk-container-id-1 div.sk-parallel-item::after {content: "";width: 100%;border-bottom: 1px solid gray;flex-grow: 1;}#sk-container-id-1 div.sk-label:hover 
label.sk-toggleable__label {background-color: #d4ebff;}#sk-container-id-1 div.sk-serial::before {content: "";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: 0;}#sk-container-id-1 div.sk-serial {display: flex;flex-direction: column;align-items: center;background-color: white;padding-right: 0.2em;padding-left: 0.2em;position: relative;}#sk-container-id-1 div.sk-item {position: relative;z-index: 1;}#sk-container-id-1 div.sk-parallel {display: flex;align-items: stretch;justify-content: center;background-color: white;position: relative;}#sk-container-id-1 div.sk-item::before, #sk-container-id-1 div.sk-parallel-item::before {content: "";position: absolute;border-left: 1px solid gray;box-sizing: border-box;top: 0;bottom: 0;left: 50%;z-index: -1;}#sk-container-id-1 div.sk-parallel-item {display: flex;flex-direction: column;z-index: 1;position: relative;background-color: white;}#sk-container-id-1 div.sk-parallel-item:first-child::after {align-self: flex-end;width: 50%;}#sk-container-id-1 div.sk-parallel-item:last-child::after {align-self: flex-start;width: 50%;}#sk-container-id-1 div.sk-parallel-item:only-child::after {width: 0;}#sk-container-id-1 div.sk-dashed-wrapped {border: 1px dashed gray;margin: 0 0.4em 0.5em 0.4em;box-sizing: border-box;padding-bottom: 0.4em;background-color: white;}#sk-container-id-1 div.sk-label label {font-family: monospace;font-weight: bold;display: inline-block;line-height: 1.2em;}#sk-container-id-1 div.sk-label-container {text-align: center;}#sk-container-id-1 div.sk-container {/* jupyter's `normalize.less` sets `[hidden] { display: none; }` but bootstrap.min.css set `[hidden] { display: none !important; }` so we also need the `!important` here to be able to override the default hidden behavior on the sphinx rendered scikit-learn.org. 
See: https://github.com/scikit-learn/scikit-learn/issues/21755 */display: inline-block !important;position: relative;}#sk-container-id-1 div.sk-text-repr-fallback {display: none;}</style>
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier()
test_data_glove = text2vec(twenty_test['data'])
test_data_glove
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -6.760635 | 5.063863 | -2.779060 | 3.699120 | -2.858086 | 0.135230 | 20.811229 | -19.425567 | 7.302950 | -5.826012 | ... | 3.833378 | 6.794452 | -0.921720 | 12.187404 | -5.547615 | -4.133999 | 3.588260 | -0.497106 | -2.542142 | -11.362855 |
| 0 | 1.632616 | 2.512300 | -0.745513 | -3.081154 | 2.182067 | -1.988816 | 7.533100 | -1.015740 | -0.829598 | -2.764237 | ... | 0.791851 | 2.114150 | -2.249193 | -0.163590 | -1.177710 | -2.496928 | -5.074085 | -2.666947 | 0.662050 | -3.590550 |
| 0 | 2.115766 | 2.142060 | -0.445607 | -3.229030 | 1.154580 | -2.877278 | 6.399954 | -10.445769 | 2.230760 | -3.299899 | ... | 4.388870 | 8.515056 | -0.766260 | 3.549431 | -1.643443 | -0.825730 | -2.968016 | -0.808924 | -0.000160 | -7.468189 |
| 0 | -0.802784 | 5.199443 | 4.294071 | -7.390966 | 2.747166 | -1.359952 | 15.032628 | -1.601590 | 1.474406 | 2.570105 | ... | 3.043432 | 6.176236 | -6.193988 | -3.990476 | -2.345854 | -5.534376 | -8.925422 | 1.553300 | 0.905790 | -12.824533 |
| 0 | 29.926489 | 65.324993 | -25.059592 | -64.080130 | 77.565282 | -34.614604 | 75.643770 | -115.600859 | 90.847175 | -42.971146 | ... | 40.956031 | 50.322156 | -19.537098 | 28.903925 | -34.643949 | -69.894146 | -94.992145 | -48.601895 | -29.098555 | -91.934770 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 0 | 1.829235 | 4.513807 | 2.916520 | 2.237308 | -1.704831 | -1.811192 | 22.196895 | -12.858912 | -4.054810 | -3.130457 | ... | 6.019246 | 8.949456 | -4.682214 | -5.648911 | -1.026898 | 3.719006 | 2.449941 | -6.487197 | 1.340930 | -7.325196 |
| 0 | -0.963815 | 5.491164 | 3.567377 | -6.048021 | -5.059298 | -0.977958 | 15.131499 | -0.904470 | 2.185990 | -1.459807 | ... | 0.968499 | 4.725793 | -0.726944 | 1.328612 | -3.144209 | 1.643127 | -1.259245 | -0.880740 | -6.713165 | -3.115454 |
| 0 | 6.801324 | 15.348126 | -17.051718 | 5.030998 | 9.332448 | -5.716691 | 56.409175 | -56.250411 | -4.028209 | -11.687558 | ... | 22.884424 | 12.940570 | 1.058664 | 21.879058 | -20.897253 | 2.537755 | 3.774890 | -11.495336 | -2.609774 | -36.597559 |
| 0 | 1.054090 | 0.764524 | 1.958340 | -1.085245 | -0.441392 | -0.421970 | 6.139770 | -0.612219 | -2.251460 | -0.465165 | ... | 0.377958 | 1.957450 | -1.705220 | -0.509700 | 0.016110 | 1.461620 | 1.589069 | 2.267340 | 0.447919 | -0.469250 |
| 0 | -18.387286 | 13.274879 | -7.895913 | -1.831442 | -10.424961 | -12.248442 | 32.153890 | -40.169293 | 13.089525 | -21.306493 | ... | 6.497279 | 8.340729 | 4.996109 | 23.442078 | -3.701088 | -11.671505 | 9.209790 | -10.002501 | -0.815266 | -17.024052 |
1102 rows × 25 columns
predict = clf.predict(test_data_glove )
print (confusion_matrix(twenty_test['target'], predict))
print(classification_report(twenty_test['target'], predict))
[[225 35 59]
[ 26 313 50]
[ 56 98 240]]
precision recall f1-score support
0 0.73 0.71 0.72 319
1 0.70 0.80 0.75 389
2 0.69 0.61 0.65 394
accuracy 0.71 1102
macro avg 0.71 0.71 0.70 1102
weighted avg 0.71 0.71 0.70 1102