{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "3dda6a69",
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports live in this one cell so a fresh kernel can run the notebook top to bottom.\n",
    "import gensim.downloader\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.datasets import fetch_20newsgroups\n",
    "from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer\n",
    "from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.pipeline import Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7fd6636b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']\n"
     ]
    }
   ],
   "source": [
    "# Names of the pretrained models that gensim's downloader can fetch.\n",
    "print(list(gensim.downloader.info()['models'].keys()))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3f93b5f6",
   "metadata": {},
   "source": [
    "# GloVe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "be870586",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Model identifier taken from the list printed above; change it here to try another embedding.\n",
    "GLOVE_MODEL_NAME = \"glove-twitter-25\"\n",
    "glove_model = gensim.downloader.load(GLOVE_MODEL_NAME)  # load the GloVe vectors (downloads on first use)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "599d6406",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[-0.96419 -0.60978 0.67449 0.35113 0.41317 -0.21241 1.3796\n",
      " 0.12854 0.31567 0.66325 0.3391 -0.18934 -3.325 -1.1491\n",
      " -0.4129 0.2195 0.8706 -0.50616 -0.12781 -0.066965 0.065761\n",
      " 0.43927 0.1758 -0.56058 0.13529 ]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[('dog', 0.9590820074081421),\n",
       " ('monkey', 0.920357882976532),\n",
       " ('bear', 0.9143136739730835),\n",
       " ('pet', 0.9108031392097473),\n",
       " ('girl', 0.8880629539489746),\n",
       " ('horse', 0.8872726559638977),\n",
       " ('kitty', 0.8870542049407959),\n",
       " ('puppy', 0.886769711971283),\n",
       " ('hot', 0.886525571346283),\n",
       " ('lady', 0.8845519423484802)]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(glove_model['cat'])  # embedding vector for the word 'cat'\n",
    "glove_model.most_similar(\"cat\")  # words most similar to 'cat', with their similarity scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "2db71cfb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.60927683"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Similarity score between the embeddings of two words.\n",
    "glove_model.similarity('cat', 'bus')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "7788acf5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep three newsgroups and strip the parts of each post listed in `remove`.\n",
    "categories = ['alt.atheism', 'comp.graphics', 'sci.space']\n",
    "remove = ('headers', 'footers', 'quotes')\n",
    "\n",
    "# Both splits must be fetched with identical options, so the options are defined once.\n",
    "newsgroups_kwargs = dict(shuffle=True, random_state=42, categories=categories, remove=remove)\n",
    "twenty_train = fetch_20newsgroups(subset='train', **newsgroups_kwargs)\n",
    "twenty_test = fetch_20newsgroups(subset='test', **newsgroups_kwargs)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "79dd1ac1",
   "metadata": {},
   "source": [
    "# Векторизуем обучающую выборку\n",
    "Получаем матрицу \"Документ-термин\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "0565dd1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Token-count vectorizer configured with scikit-learn's built-in English stop-word list.\n",
    "vectorizer = CountVectorizer(stop_words='english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "a681a1d6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1657, 23297)\n"
     ]
    },
    {
     "data": {
      "text/html": [ "
\n", " | 00 | \n", "000 | \n", "0000 | \n", "00000 | \n", "000000 | \n", "000005102000 | \n", "000062david42 | \n", "000100255pixel | \n", "00041032 | \n", "0004136 | \n", "... | \n", "zurbrin | \n", "zurich | \n", "zus | \n", "zvi | \n", "zwaartepunten | \n", "zwak | \n", "zwakke | \n", "zware | \n", "zwarte | \n", "zyxel | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
1 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
2 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
3 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
4 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "... | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
5 rows × 23297 columns
\n", "