From 5bc2f0c7b7aed92781a9e228f3cae440bc261de0 Mon Sep 17 00:00:00 2001 From: Andrey Date: Thu, 12 Dec 2024 16:13:28 +0300 Subject: [PATCH] lec-12 recsys --- README.md | 4 +- assets/recsys/.gitignore | 2 + assets/recsys/requirements.txt | 9 + assets/recsys/research/recommendations.ipynb | 4637 +++++++++++++++++ assets/recsys/service/events/Dockerfile | 9 + .../recsys/service/events/events_service.py | 54 + assets/recsys/service/events/requirements.txt | 2 + assets/recsys/service/features/Dockerfile | 9 + .../service/features/feature_service.py | 57 + .../recsys/service/features/requirements.txt | 6 + .../recsys/service/recommendations/Dockerfile | 9 + .../service/recommendations/rec_handler.py | 43 + .../recommendations/recommendation_service.py | 116 + .../service/recommendations/requirements.txt | 12 + 14 files changed, 4968 insertions(+), 1 deletion(-) create mode 100644 assets/recsys/.gitignore create mode 100644 assets/recsys/requirements.txt create mode 100644 assets/recsys/research/recommendations.ipynb create mode 100644 assets/recsys/service/events/Dockerfile create mode 100644 assets/recsys/service/events/events_service.py create mode 100644 assets/recsys/service/events/requirements.txt create mode 100644 assets/recsys/service/features/Dockerfile create mode 100644 assets/recsys/service/features/feature_service.py create mode 100644 assets/recsys/service/features/requirements.txt create mode 100644 assets/recsys/service/recommendations/Dockerfile create mode 100644 assets/recsys/service/recommendations/rec_handler.py create mode 100644 assets/recsys/service/recommendations/recommendation_service.py create mode 100644 assets/recsys/service/recommendations/requirements.txt diff --git a/README.md b/README.md index aed965a..c070272 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,9 @@ | 19.11.2024 | [Docker compose](./lectures/lec10-docker_compose.pptx) | | 21.11.2024 | [Мониторинг сервиса. Prometheus. 
Graphana](./lectures/lec11-monitoring.pptx) | | 28.11.2024 | [Работа с БД](./lectures/lec12-database.pptx) | - +| 05.12.2024 | [Рекомендательные системы](./lectures/lec13-recsys.pptx) | +| 12.12.2024 | Создание сервиса рекомендаций - код в [директории](./assets/recsys)| +| 19.12.2024 | Создание сервиса рекомендаций - код в [директории](./assets/recsys)| ## Перенос занятий ### Лекции diff --git a/assets/recsys/.gitignore b/assets/recsys/.gitignore new file mode 100644 index 0000000..22a94e1 --- /dev/null +++ b/assets/recsys/.gitignore @@ -0,0 +1,2 @@ +*.parquet +*__pycache__* diff --git a/assets/recsys/requirements.txt b/assets/recsys/requirements.txt new file mode 100644 index 0000000..2c909dc --- /dev/null +++ b/assets/recsys/requirements.txt @@ -0,0 +1,9 @@ +pandas +matplotlib +seaborn +pyarrow==13.0.0 +mlflow==2.7.1 +implicit==0.7.2 +catboost +fastapi +uvicorn diff --git a/assets/recsys/research/recommendations.ipynb b/assets/recsys/research/recommendations.ipynb new file mode 100644 index 0000000..ebc8257 --- /dev/null +++ b/assets/recsys/research/recommendations.ipynb @@ -0,0 +1,4637 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 53, + "id": "662d04e7-1b0b-4e4a-9ddf-4526d7fef119", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "id": "bf80fbc5-b660-4fac-8fbb-a5cae77313b3", + "metadata": {}, + "source": [ + "# === ЭТАП 1 ===" + ] + }, + { + "cell_type": "markdown", + "id": "5263a8b3-fe99-4204-8a2e-105182792c11", + "metadata": {}, + "source": [ + "# Загрузка первичных данных" + ] + }, + { + "cell_type": "markdown", + "id": "1b54a6a5-1656-4e3c-99d1-49dc39451d33", + "metadata": {}, + "source": [ + "Загружаем первичные данные из файлов:\n", + "- tracks.parquet\n", + "- catalog_names.parquet\n", + "- interactions.parquet" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": 
"5d4b8961-3f35-4e58-9d6b-3e2dbd2c4224", + "metadata": {}, + "outputs": [], + "source": [ + "tracks = pd.read_parquet('../data/tracks.parquet')\n", + "interactions = pd.read_parquet('../data/interactions.parquet')\n", + "catalog = pd.read_parquet('../data/catalog_names.parquet')" + ] + }, + { + "cell_type": "markdown", + "id": "e8f2a1f7-a05f-4f39-af90-5f4018aa6f9d", + "metadata": {}, + "source": [ + "# Обзор данных" + ] + }, + { + "cell_type": "markdown", + "id": "46a85307-896c-4fac-9fcf-f0dffa90889e", + "metadata": {}, + "source": [ + "Проверяем данные, есть ли с ними явные проблемы." + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "c9f8f17e-9b56-4f5a-a463-f694a993effb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
track_idalbumsartistsgenresnamecount
026[3, 2490753][16][11, 21]Complimentary Me5
138[3, 2490753][16][11, 21]Momma's Boy8
2135[12, 214, 2490809][84][11]Atticus16
3136[12, 214, 2490809][84][11]24 Hours7
4138[12, 214, 322, 72275, 72292, 91199, 213505, 24...[84][11]Don't Upset The Rhythm (Go Baby Go)17
\n", + "
" + ], + "text/plain": [ + " track_id albums artists \\\n", + "0 26 [3, 2490753] [16] \n", + "1 38 [3, 2490753] [16] \n", + "2 135 [12, 214, 2490809] [84] \n", + "3 136 [12, 214, 2490809] [84] \n", + "4 138 [12, 214, 322, 72275, 72292, 91199, 213505, 24... [84] \n", + "\n", + " genres name count \n", + "0 [11, 21] Complimentary Me 5 \n", + "1 [11, 21] Momma's Boy 8 \n", + "2 [11] Atticus 16 \n", + "3 [11] 24 Hours 7 \n", + "4 [11] Don't Upset The Rhythm (Go Baby Go) 17 " + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tracks.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0d9d8b5d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 932664 entries, 0 to 932663\n", + "Data columns (total 6 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 track_id 932664 non-null int64 \n", + " 1 albums 932664 non-null object\n", + " 2 artists 932664 non-null object\n", + " 3 genres 932664 non-null object\n", + " 4 name 932664 non-null object\n", + " 5 count 932664 non-null int64 \n", + "dtypes: int64(2), object(4)\n", + "memory usage: 42.7+ MB\n" + ] + } + ], + "source": [ + "tracks.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "092a1f8b", + "metadata": {}, + "outputs": [], + "source": [ + "tracks['empty_genre'] = tracks['genres'].apply(lambda data: len(data) == 0)\n", + "tracks['empty_album'] = tracks['albums'].apply(lambda data: len(data) == 0)\n", + "tracks['empty_artist'] = tracks['artists'].apply(lambda data: len(data) == 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "7bd05c0f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "932664" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tracks = tracks.loc[~tracks['empty_artist']]\n", + 
"tracks = tracks.loc[~tracks['empty_album']]\n", + "tracks = tracks.loc[~tracks['empty_genre']]\n", + "len(tracks)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "6e455a1f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtypename
03albumTaller Children
112albumWild Young Hearts
213albumLonesome Crow
317albumGraffiti Soul
426albumBlues Six Pack
\n", + "
" + ], + "text/plain": [ + " id type name\n", + "0 3 album Taller Children\n", + "1 12 album Wild Young Hearts\n", + "2 13 album Lonesome Crow\n", + "3 17 album Graffiti Soul\n", + "4 26 album Blues Six Pack" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "catalog.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "97a67071", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['album', 'artist', 'genre', 'track'], dtype=object)" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "catalog.type.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "5ceb5505", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idtypename
81911638012trackThe Riddle
\n", + "
" + ], + "text/plain": [ + " id type name\n", + "819116 38012 track The Riddle" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "catalog.loc[(catalog.type == 'track') & (catalog.id == 38012 )]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "61fc07be", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "166\n", + "658724\n", + "153581\n" + ] + } + ], + "source": [ + "genre_list = catalog.loc[catalog.type=='genre', 'id'].values\n", + "print(len(genre_list))\n", + "album_list = catalog.loc[catalog.type=='album', 'id'].values\n", + "print(len(album_list))\n", + "artist_list = catalog.loc[catalog.type=='artist', 'id'].values\n", + "print(len(artist_list))" + ] + }, + { + "cell_type": "markdown", + "id": "4b96df54", + "metadata": {}, + "source": [ + "Проверяем, есть ли неизвестные данные в столбцах (которых нет в каталоге)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "0c302f8d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
track_idalbumsartistsgenresnamecountempty_genreempty_albumempty_artistunknown_genreunknown_artistunknown_album
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [track_id, albums, artists, genres, name, count, empty_genre, empty_album, empty_artist, unknown_genre, unknown_artist, unknown_album]\n", + "Index: []" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tracks['unknown_genre'] = tracks['genres'].apply(lambda alb: sum([1 if a in genre_list else 0 for a in alb]) / len(alb))\n", + "tracks.query('unknown_genre < 1')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "efd2e2eb-3bec-4ce1-87ac-232bab8bc0d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
track_idalbumsartistsgenresnamecountempty_genreempty_albumempty_artistunknown_genreunknown_artist
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [track_id, albums, artists, genres, name, count, empty_genre, empty_album, empty_artist, unknown_genre, unknown_artist]\n", + "Index: []" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tracks['unknown_artist'] = tracks['artists'].apply(lambda alb: sum([1 if a in artist_list else 0 for a in alb]) / len(alb))\n", + "tracks.query('unknown_artist < 1')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "f462a430", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
track_idalbumsartistsgenresnamecountempty_genreempty_albumempty_artistunknown_genreunknown_artistunknown_album
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [track_id, albums, artists, genres, name, count, empty_genre, empty_album, empty_artist, unknown_genre, unknown_artist, unknown_album]\n", + "Index: []" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 10 minutes\n", + "tracks['unknown_album'] = tracks['albums'].apply(lambda alb: sum([1 if a in album_list else 0 for a in alb]) / len(alb))\n", + "tracks.query('unknown_album < 1')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "2e0fa8d5", + "metadata": {}, + "outputs": [], + "source": [ + "tracks = tracks.loc[tracks['unknown_genre'] == 1]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "5f6d9ced", + "metadata": {}, + "outputs": [], + "source": [ + "track_list = tracks.track_id.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "2a64e8e6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 214714415 entries, 0 to 291\n", + "Data columns (total 4 columns):\n", + " # Column Dtype \n", + "--- ------ ----- \n", + " 0 user_id int32 \n", + " 1 track_id int32 \n", + " 2 track_seq int16 \n", + " 3 started_at datetime64[ns]\n", + "dtypes: datetime64[ns](1), int16(1), int32(2)\n", + "memory usage: 5.2 GB\n" + ] + }, + { + "data": { + "text/plain": [ + "user_id 1368700\n", + "track_id 932664\n", + "track_seq 16636\n", + "started_at 365\n", + "dtype: int64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "interactions.info()\n", + "interactions.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "5ea6c6df", + "metadata": {}, + "outputs": [], + "source": [ + "interactions = interactions.loc[interactions['track_id'].isin(track_list)]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "55b0cb35", + "metadata": {}, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 214714415 entries, 0 to 291\n", + "Data columns (total 4 columns):\n", + " # Column Dtype \n", + "--- ------ ----- \n", + " 0 user_id int32 \n", + " 1 track_id int32 \n", + " 2 track_seq int16 \n", + " 3 started_at datetime64[ns]\n", + "dtypes: datetime64[ns](1), int16(1), int32(2)\n", + "memory usage: 5.2 GB\n" + ] + }, + { + "data": { + "text/plain": [ + "user_id 1368700\n", + "track_id 932664\n", + "track_seq 16636\n", + "started_at 365\n", + "dtype: int64" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "interactions.info()\n", + "interactions.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "267e9d6d", + "metadata": {}, + "outputs": [], + "source": [ + "def flat_index(df_stats): \n", + " df_stats.columns = df_stats.columns.get_level_values(0) + '_' + df_stats.columns.get_level_values(1) \n", + " df_stats.columns.to_flat_index() \n", + " df_stats.columns = df_stats.columns.to_flat_index() \n", + " df_stats.reset_index(inplace=True) \n", + " return df_stats" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "064423eb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['user_id', 'track_id', 'track_seq']" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "numeric_cols = interactions.select_dtypes(include=['int16','int32']).columns.tolist()\n", + "numeric_cols" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "7b22d39e", + "metadata": {}, + "outputs": [], + "source": [ + "users_grouped = interactions[numeric_cols].groupby('user_id').agg(['count', 'sum', 'max'])\n", + "users_grouped = flat_index(users_grouped)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "28240cf5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idtrack_id_counttrack_id_sumtrack_id_maxtrack_seq_counttrack_seq_sumtrack_seq_max
0026222329191204976212635126
11361433967998834367713666636
2213382479927716502001310314
33331437092275781949993356133
4424585705671429876629524531738256
........................
136869513745781136012196435785893116611
1368696137457923178064881418995162328924
13686971374580274105132440819661810127437911277
136869813745815032132966573699242432503126756503
136869913745822901762670172110073637529042717292
\n", + "

1368700 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " user_id track_id_count track_id_sum track_id_max track_seq_count \\\n", + "0 0 26 222329191 20497621 26 \n", + "1 1 36 1433967998 83436771 36 \n", + "2 2 13 382479927 71650200 13 \n", + "3 3 33 1437092275 78194999 33 \n", + "4 4 245 8570567142 98766295 245 \n", + "... ... ... ... ... ... \n", + "1368695 1374578 11 360121964 35785893 11 \n", + "1368696 1374579 23 178064881 41899516 23 \n", + "1368697 1374580 274 10513244081 96618101 274 \n", + "1368698 1374581 503 21329665736 99242432 503 \n", + "1368699 1374582 290 17626701721 100736375 290 \n", + "\n", + " track_seq_sum track_seq_max \n", + "0 351 26 \n", + "1 666 36 \n", + "2 103 14 \n", + "3 561 33 \n", + "4 31738 256 \n", + "... ... ... \n", + "1368695 66 11 \n", + "1368696 289 24 \n", + "1368697 37911 277 \n", + "1368698 126756 503 \n", + "1368699 42717 292 \n", + "\n", + "[1368700 rows x 7 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "users_grouped" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "93201b84", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
track_iduser_id_countuser_id_sumuser_id_maxtrack_seq_counttrack_seq_sumtrack_seq_max
026531873891008720551
138865109211304168892
21351611203352132458616161
31367525397412031847102
41381715656791132458617253
........................
932659101478482628096061055764655494910
9326601014901487244514110132215072389874911
9326611014930579685575912944319119694912
932662101495927201548586813205662097181146
9326631015218193424517206135041134193242768
\n", + "

932664 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " track_id user_id_count user_id_sum user_id_max track_seq_count \\\n", + "0 26 5 3187389 1008720 5 \n", + "1 38 8 6510921 1304168 8 \n", + "2 135 16 11203352 1324586 16 \n", + "3 136 7 5253974 1203184 7 \n", + "4 138 17 15656791 1324586 17 \n", + "... ... ... ... ... ... \n", + "932659 101478482 6 2809606 1055764 6 \n", + "932660 101490148 72 44514110 1322150 72 \n", + "932661 101493057 9 6855759 1294431 9 \n", + "932662 101495927 20 15485868 1320566 20 \n", + "932663 101521819 34 24517206 1350411 34 \n", + "\n", + " track_seq_sum track_seq_max \n", + "0 5 1 \n", + "1 9 2 \n", + "2 16 1 \n", + "3 10 2 \n", + "4 25 3 \n", + "... ... ... \n", + "932659 5549 4910 \n", + "932660 38987 4911 \n", + "932661 11969 4912 \n", + "932662 9718 1146 \n", + "932663 19324 2768 \n", + "\n", + "[932664 rows x 7 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tracks_grouped = interactions[numeric_cols].groupby('track_id').agg(['count','sum','max'])\n", + "tracks_grouped = flat_index(tracks_grouped)\n", + "tracks_grouped" + ] + }, + { + "cell_type": "markdown", + "id": "1acece8d", + "metadata": {}, + "source": [ + "Найдем пользователей с малой историей - для них нет смысла считать персональные рекомендации. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "407cedc8", + "metadata": {}, + "outputs": [], + "source": [ + "small_users = users_grouped.query('track_id_count <= 3')['user_id'] # пользователи, прослушавшие менее 3 треков\n", + "interactions = interactions.loc[~interactions['user_id'].isin(small_users)]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "f6b86360", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "932664" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(interactions.track_id.unique())" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "3b02415b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idname
81247126Complimentary Me
81247238Momma's Boy
812473135Atticus
81247413624 Hours
812475138Don't Upset The Rhythm (Go Baby Go)
.........
1812466101478482На лицо
1812467101490148Без капли мысли
1812468101493057SKITTLES
1812469101495927Москва
1812470101521819Вокзал
\n", + "

1000000 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " id name\n", + "812471 26 Complimentary Me\n", + "812472 38 Momma's Boy\n", + "812473 135 Atticus\n", + "812474 136 24 Hours\n", + "812475 138 Don't Upset The Rhythm (Go Baby Go)\n", + "... ... ...\n", + "1812466 101478482 На лицо\n", + "1812467 101490148 Без капли мысли\n", + "1812468 101493057 SKITTLES\n", + "1812469 101495927 Москва\n", + "1812470 101521819 Вокзал\n", + "\n", + "[1000000 rows x 2 columns]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "catalog_tracks = catalog.loc[catalog.type=='track'][['id', 'name']]\n", + "catalog_tracks.to_parquet('../data/catalog_tracks.parquet')\n", + "catalog_tracks\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e53cf75b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idname
8123050all
8123061eastern
8123072rusrock
8123083rusrap
8123094postrock
.........
8124661182balkan
8124671197experimental
8124681370europop
8124691484meditation
8124701542asiapop
\n", + "

166 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " id name\n", + "812305 0 all\n", + "812306 1 eastern\n", + "812307 2 rusrock\n", + "812308 3 rusrap\n", + "812309 4 postrock\n", + "... ... ...\n", + "812466 1182 balkan\n", + "812467 1197 experimental\n", + "812468 1370 europop\n", + "812469 1484 meditation\n", + "812470 1542 asiapop\n", + "\n", + "[166 rows x 2 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "catalog_genres = catalog.loc[catalog.type=='genre'][['id', 'name']]\n", + "catalog_genres.to_parquet('../data/catalog_genres.parquet')\n", + "catalog_genres" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "f261c887", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idname
03Taller Children
112Wild Young Hearts
213Lonesome Crow
317Graffiti Soul
426Blues Six Pack
.........
65871921458141The Lazy Singles
65872021458207Jackie Mittoo Anthology
65872121458968Master Composers: Johann Sebastian Bach
65872221459622Take the Money and Run
65872321461648Tropical
\n", + "

658724 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " id name\n", + "0 3 Taller Children\n", + "1 12 Wild Young Hearts\n", + "2 13 Lonesome Crow\n", + "3 17 Graffiti Soul\n", + "4 26 Blues Six Pack\n", + "... ... ...\n", + "658719 21458141 The Lazy Singles\n", + "658720 21458207 Jackie Mittoo Anthology\n", + "658721 21458968 Master Composers: Johann Sebastian Bach\n", + "658722 21459622 Take the Money and Run\n", + "658723 21461648 Tropical\n", + "\n", + "[658724 rows x 2 columns]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "catalog_albums = catalog.loc[catalog.type=='album'][['id', 'name']]\n", + "catalog_albums.to_parquet('../data/catalog_albums.parquet')\n", + "catalog_albums" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "d5095297", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idname
6587244Kenny Dorham
6587255Max Roach
6587267Francis Rossi
6587279Status Quo
65872812Phil Everly
.........
81230016093680Los Tiburones
81230116097398AMELI
812302160984452GANGSTA
81230316099125Daria
81230416102782Pan dö Baré
\n", + "

153581 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " id name\n", + "658724 4 Kenny Dorham\n", + "658725 5 Max Roach\n", + "658726 7 Francis Rossi\n", + "658727 9 Status Quo\n", + "658728 12 Phil Everly\n", + "... ... ...\n", + "812300 16093680 Los Tiburones\n", + "812301 16097398 AMELI\n", + "812302 16098445 2GANGSTA\n", + "812303 16099125 Daria\n", + "812304 16102782 Pan dö Baré\n", + "\n", + "[153581 rows x 2 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "catalog_artists = catalog.loc[catalog.type=='artist'][['id', 'name']]\n", + "catalog_artists.to_parquet('data/catalog_artists.parquet')\n", + "catalog_artists" + ] + }, + { + "cell_type": "markdown", + "id": "f762beed", + "metadata": {}, + "source": [ + "## Анализ каталогов\n", + "## Genres" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "41225d91", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idname
8123050all
8123061eastern
8123072rusrock
8123083rusrap
8123094postrock
.........
8124661182balkan
8124671197experimental
8124681370europop
8124691484meditation
8124701542asiapop
\n", + "

166 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " id name\n", + "812305 0 all\n", + "812306 1 eastern\n", + "812307 2 rusrock\n", + "812308 3 rusrap\n", + "812309 4 postrock\n", + "... ... ...\n", + "812466 1182 balkan\n", + "812467 1197 experimental\n", + "812468 1370 europop\n", + "812469 1484 meditation\n", + "812470 1542 asiapop\n", + "\n", + "[166 rows x 2 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "catalog_genres" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "e3627b2a", + "metadata": {}, + "outputs": [], + "source": [ + "genres_dict = dict(zip(catalog_genres['id'], [0]*catalog.shape[0]))\n", + "\n", + "for k,v in tracks.iterrows():\n", + " genres = v['genres']\n", + " for g in genres:\n", + " genres_dict[g] += 1\n", + "genres_df = pd.DataFrame.from_dict(genres_dict, orient='index')" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "df97b13b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcountname
000all
111186eastern
2236649rusrock
3365958rusrap
442054postrock
............
1611182454balkan
16211971674experimental
16313700europop
16414841606meditation
1651542176asiapop
\n", + "

166 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " id count name\n", + "0 0 0 all\n", + "1 1 1186 eastern\n", + "2 2 36649 rusrock\n", + "3 3 65958 rusrap\n", + "4 4 2054 postrock\n", + ".. ... ... ...\n", + "161 1182 454 balkan\n", + "162 1197 1674 experimental\n", + "163 1370 0 europop\n", + "164 1484 1606 meditation\n", + "165 1542 176 asiapop\n", + "\n", + "[166 rows x 3 columns]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "genres_df.reset_index(inplace=True)\n", + "genres_df.columns=['id','count']\n", + "genres_df['name'] = genres_df['id'].apply(lambda x: catalog_genres.loc[catalog_genres['id'] == x]['name'].values[0] )\n", + "genres_df" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "bd810c63", + "metadata": {}, + "outputs": [], + "source": [ + "catalog_genres = catalog_genres.merge(genres_df[['count','id']], on='id')\n", + "catalog_genres.to_parquet('data/catalog_genres.parquet')" + ] + }, + { + "cell_type": "markdown", + "id": "3fe7c8ae", + "metadata": {}, + "source": [ + "## Tracks" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "faebd87d", + "metadata": {}, + "outputs": [], + "source": [ + "tracks_dict = dict(zip(catalog_tracks['id'], [0]*catalog_tracks.shape[0]))\n", + "tracks = tracks.merge(catalog_tracks, left_on='track_id', right_on='id')\n", + "tracks = tracks.merge(tracks_grouped, on='track_id')\n", + "tracks = tracks[['track_id', 'albums', 'artists', 'genres', 'name_x', 'user_id_count']]\n", + "tracks.rename(columns={'user_id_count':'count'}, inplace=True)\n", + "tracks.rename(columns={'name_x':'name'}, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "9bdda605", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
track_idalbumsartistsgenresnamecount
026[3, 2490753][16][11, 21]Complimentary Me5
138[3, 2490753][16][11, 21]Momma's Boy8
2135[12, 214, 2490809][84][11]Atticus16
3136[12, 214, 2490809][84][11]24 Hours7
4138[12, 214, 322, 72275, 72292, 91199, 213505, 24...[84][11]Don't Upset The Rhythm (Go Baby Go)17
.....................
932659101478482[21399811][5540395][3, 75]На лицо6
932660101490148[21403052][9078726][11, 20]Без капли мысли72
932661101493057[21403883][11865715][44, 75]SKITTLES9
932662101495927[21404975][4462686][3, 75]Москва20
932663101521819[21414638][5056591][3, 75]Вокзал34
\n", + "

932664 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " track_id albums \\\n", + "0 26 [3, 2490753] \n", + "1 38 [3, 2490753] \n", + "2 135 [12, 214, 2490809] \n", + "3 136 [12, 214, 2490809] \n", + "4 138 [12, 214, 322, 72275, 72292, 91199, 213505, 24... \n", + "... ... ... \n", + "932659 101478482 [21399811] \n", + "932660 101490148 [21403052] \n", + "932661 101493057 [21403883] \n", + "932662 101495927 [21404975] \n", + "932663 101521819 [21414638] \n", + "\n", + " artists genres name count \n", + "0 [16] [11, 21] Complimentary Me 5 \n", + "1 [16] [11, 21] Momma's Boy 8 \n", + "2 [84] [11] Atticus 16 \n", + "3 [84] [11] 24 Hours 7 \n", + "4 [84] [11] Don't Upset The Rhythm (Go Baby Go) 17 \n", + "... ... ... ... ... \n", + "932659 [5540395] [3, 75] На лицо 6 \n", + "932660 [9078726] [11, 20] Без капли мысли 72 \n", + "932661 [11865715] [44, 75] SKITTLES 9 \n", + "932662 [4462686] [3, 75] Москва 20 \n", + "932663 [5056591] [3, 75] Вокзал 34 \n", + "\n", + "[932664 rows x 6 columns]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tracks" + ] + }, + { + "cell_type": "markdown", + "id": "cddebf8e", + "metadata": {}, + "source": [ + "## Artists" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "fdd805b2", + "metadata": {}, + "outputs": [], + "source": [ + "artists_dict = dict(zip(catalog_artists['id'], [0]*catalog_artists.shape[0]))\n", + "\n", + "for k,v in tracks.iterrows():\n", + " artists = v['artists']\n", + " for g in artists:\n", + " artists_dict[g] += 1\n", + "artists_df = pd.DataFrame.from_dict(artists_dict, orient='index')" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "064f70d0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcountname
041Kenny Dorham
151Max Roach
271Francis Rossi
39182Status Quo
4121Phil Everly
............
153576160936804Los Tiburones
153577160973982AMELI
1535781609844512GANGSTA
153579160991251Daria
153580161027821Pan dö Baré
\n", + "

153581 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " id count name\n", + "0 4 1 Kenny Dorham\n", + "1 5 1 Max Roach\n", + "2 7 1 Francis Rossi\n", + "3 9 182 Status Quo\n", + "4 12 1 Phil Everly\n", + "... ... ... ...\n", + "153576 16093680 4 Los Tiburones\n", + "153577 16097398 2 AMELI\n", + "153578 16098445 1 2GANGSTA\n", + "153579 16099125 1 Daria\n", + "153580 16102782 1 Pan dö Baré\n", + "\n", + "[153581 rows x 3 columns]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "artists_df.reset_index(inplace=True)\n", + "artists_df.columns=['id','count']\n", + "artists_df['name'] = artists_df['id'].apply(lambda x: catalog_artists.loc[catalog_artists['id'] == x]['name'].values[0] )\n", + "artists_df" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "04e2830a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idcountname
139301187082038Владимир Высоцкий
261711363сборник
151381398961273Armin van Buuren
209691701201Wolfgang Amadeus Mozart
32912271101Johann Sebastian Bach
2570118181045Hans Zimmer
41115511030Pyotr Ilyich Tchaikovsky
242610987973Elvis Presley
18025188963966Аквариум
16706164416919Михаил Шуфутинский
\n", + "
" + ], + "text/plain": [ + " id count name\n", + "13930 118708 2038 Владимир Высоцкий\n", + "26 171 1363 сборник\n", + "15138 139896 1273 Armin van Buuren\n", + "2096 9170 1201 Wolfgang Amadeus Mozart\n", + "329 1227 1101 Johann Sebastian Bach\n", + "2570 11818 1045 Hans Zimmer\n", + "411 1551 1030 Pyotr Ilyich Tchaikovsky\n", + "2426 10987 973 Elvis Presley\n", + "18025 188963 966 Аквариум\n", + "16706 164416 919 Михаил Шуфутинский" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "artists_df.sort_values(by='count', ascending=False).head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "57e71559", + "metadata": {}, + "outputs": [], + "source": [ + "catalog_artists = catalog_artists.merge(artists_df[['count','id']], on='id')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "5dab64e2", + "metadata": {}, + "outputs": [], + "source": [ + "catalog_artists.to_parquet('data/catalog_artists.parquet')" + ] + }, + { + "cell_type": "markdown", + "id": "318b573a-9e2d-4808-95db-60cfb8bbdb73", + "metadata": { + "slideshow": { + "slide_type": "" + }, + "tags": [] + }, + "source": [ + "# Выводы" + ] + }, + { + "cell_type": "markdown", + "id": "caa96e12-36a8-4401-8f11-98627a49ae9d", + "metadata": {}, + "source": [ + "Приведём выводы по первому знакомству с данными:\n", + "- есть ли с данными явные проблемы,\n", + "1) Огромная таблица взаимодействий - 220 млн записей.\n", + "1) Представление жанров, артистов и альбомов списками, что усложнит обработку\n", + "\n", + "\n", + "- какие корректирующие действия (в целом) были предприняты.\n", + "1) Были удалены треки без авторов, жанров и альбома\n", + "1) Удалены треки, у которых хотя бы один жанр был неизвестным\n", + "1) Удалены взаимодействия с треками, которых нет в tracks\n", + "1) Удалены из взаимодействий пользователи с количеством прослушиваний менее 3. 
Для них имеет смысл предсказывать только онлайн " + ] + }, + { + "cell_type": "markdown", + "id": "7bc3296b-eba6-4333-a78d-b9304aa87e3d", + "metadata": {}, + "source": [ + "# === ЭТАП 2 ===" + ] + }, + { + "cell_type": "markdown", + "id": "68e73960-fd38-4e15-8db0-9a25c35dfd25", + "metadata": {}, + "source": [ + "# EDA" + ] + }, + { + "cell_type": "markdown", + "id": "a30e823e-8e0f-4a76-a02e-8d1ba8bf0f8a", + "metadata": {}, + "source": [ + "Распределение количества прослушанных треков." + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "9bf5eaba-35f7-4da7-be59-9ab4a34b2423", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([3.0000e+00, 6.0000e+00, 2.5500e+02, 4.9647e+04, 7.2378e+04,\n", + " 5.6896e+04, 4.6211e+04, 3.8646e+04, 3.3147e+04, 2.8957e+04,\n", + " 2.5255e+04, 2.2636e+04, 2.0247e+04, 1.8258e+04, 1.6701e+04,\n", + " 1.5122e+04, 1.4046e+04, 1.2963e+04, 1.2195e+04, 1.1213e+04,\n", + " 1.0359e+04, 9.5520e+03, 9.1130e+03, 8.6450e+03, 8.1950e+03,\n", + " 7.6020e+03, 7.0220e+03, 6.9950e+03, 6.4610e+03, 6.2610e+03,\n", + " 5.8450e+03, 5.7380e+03, 5.4560e+03, 5.1840e+03, 4.9350e+03,\n", + " 4.7770e+03, 4.7050e+03, 4.4300e+03, 4.2880e+03, 4.1730e+03,\n", + " 3.8810e+03, 3.8060e+03, 3.6740e+03, 3.5720e+03, 3.3860e+03,\n", + " 3.3100e+03, 3.2350e+03, 3.2640e+03, 2.9770e+03, 0.0000e+00,\n", + " 3.0450e+03, 2.8090e+03, 2.7330e+03, 2.7090e+03, 2.6730e+03,\n", + " 2.6670e+03, 2.5620e+03, 2.3920e+03, 2.3700e+03, 2.3370e+03,\n", + " 2.2830e+03, 2.1520e+03, 2.1690e+03, 2.0780e+03, 2.1080e+03,\n", + " 2.1130e+03, 1.9710e+03, 1.9290e+03, 1.8570e+03, 1.7640e+03,\n", + " 1.7840e+03, 1.7810e+03, 1.6900e+03, 1.7460e+03, 1.6840e+03,\n", + " 1.5710e+03, 1.6040e+03, 1.5720e+03, 1.6200e+03, 1.4670e+03,\n", + " 1.4740e+03, 1.4300e+03, 1.4960e+03, 1.4090e+03, 1.3540e+03,\n", + " 1.4030e+03, 1.3710e+03, 1.3610e+03, 1.3190e+03, 1.3210e+03,\n", + " 1.2080e+03, 1.2230e+03, 1.1540e+03, 1.1670e+03, 1.1550e+03,\n", + " 1.1800e+03, 1.0870e+03, 
1.0960e+03, 1.0900e+03, 1.0600e+03]),\n", + " array([ 2. , 2.98, 3.96, 4.94, 5.92, 6.9 , 7.88, 8.86,\n", + " 9.84, 10.82, 11.8 , 12.78, 13.76, 14.74, 15.72, 16.7 ,\n", + " 17.68, 18.66, 19.64, 20.62, 21.6 , 22.58, 23.56, 24.54,\n", + " 25.52, 26.5 , 27.48, 28.46, 29.44, 30.42, 31.4 , 32.38,\n", + " 33.36, 34.34, 35.32, 36.3 , 37.28, 38.26, 39.24, 40.22,\n", + " 41.2 , 42.18, 43.16, 44.14, 45.12, 46.1 , 47.08, 48.06,\n", + " 49.04, 50.02, 51. , 51.98, 52.96, 53.94, 54.92, 55.9 ,\n", + " 56.88, 57.86, 58.84, 59.82, 60.8 , 61.78, 62.76, 63.74,\n", + " 64.72, 65.7 , 66.68, 67.66, 68.64, 69.62, 70.6 , 71.58,\n", + " 72.56, 73.54, 74.52, 75.5 , 76.48, 77.46, 78.44, 79.42,\n", + " 80.4 , 81.38, 82.36, 83.34, 84.32, 85.3 , 86.28, 87.26,\n", + " 88.24, 89.22, 90.2 , 91.18, 92.16, 93.14, 94.12, 95.1 ,\n", + " 96.08, 97.06, 98.04, 99.02, 100. ]),\n", + " )" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.hist(tracks.query('count <= 100')['count'], bins=100)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "0ede50ad", + "metadata": {}, + "outputs": [], + "source": [ + "import seaborn as sns" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "3bd4b553", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 0, 'Tracks')" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sns.histplot(tracks.query('count >= 20000')['count'].fillna(0))\n", + "plt.xlabel('Tracks')" + ] + }, + { + "cell_type": "markdown", + "id": "d765519a-18dd-4d30-9e29-cc2d84cacd79", + "metadata": {}, + "source": [ + "Наиболее популярные треки" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "aecaf630-fde0-4860-b84a-42c933a9606e", + "metadata": {}, + "outputs": [], + "source": [ + "tracks[['track_id','count','name']].sort_values(by='count', ascending=False).head(10)\n", + "catalog_tracks = catalog_tracks.merge(tracks[['count','track_id']], left_on='id', right_on='track_id')\n", + "catalog_tracks.drop(columns=['track_id'], inplace=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "89335ddd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamecount_xcount_ycount
821553404Smells Like Teen Spirit111062111062111062
45166433311009Believer106921106921106921
24433178529Numb101924101924101924
47876535505245I Got Love994909949099490
77416965851540Юность866708667086670
34129524692821Way Down We Go862468624686246
44334732947997Shape of You858868588685886
65181051241318In The End852448524485244
83809795836Shape Of My Heart850428504285042
60754245499814Life847488474884748
\n", + "
" + ], + "text/plain": [ + " id name count_x count_y count\n", + "8215 53404 Smells Like Teen Spirit 111062 111062 111062\n", + "451664 33311009 Believer 106921 106921 106921\n", + "24433 178529 Numb 101924 101924 101924\n", + "478765 35505245 I Got Love 99490 99490 99490\n", + "774169 65851540 Юность 86670 86670 86670\n", + "341295 24692821 Way Down We Go 86246 86246 86246\n", + "443347 32947997 Shape of You 85886 85886 85886\n", + "651810 51241318 In The End 85244 85244 85244\n", + "83809 795836 Shape Of My Heart 85042 85042 85042\n", + "607542 45499814 Life 84748 84748 84748" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "catalog_tracks.sort_values('count', ascending=False).head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "da18b206", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_219026/1451962387.py:2: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.\n", + " p.set_xticklabels(\n" + ] + }, + { + "data": { + "text/plain": [ + "[Text(0, 0, 'Smells Like Teen Spirit'),\n", + " Text(1, 0, 'Believer'),\n", + " Text(2, 0, 'Numb'),\n", + " Text(3, 0, 'I Got Love'),\n", + " Text(4, 0, 'Юность'),\n", + " Text(5, 0, 'Way Down We Go'),\n", + " Text(6, 0, 'Shape of You'),\n", + " Text(7, 0, 'In The End'),\n", + " Text(8, 0, 'Shape Of My Heart'),\n", + " Text(9, 0, 'Life')]" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "p = sns.barplot(catalog_tracks.sort_values('count', ascending=False).head(10),x='name', y='count')\n", + "p.set_xticklabels(\n", + " p.get_xticklabels(), \n", + " rotation=45, \n", + " horizontalalignment='right'\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "3b0bff35", + "metadata": {}, + "outputs": [], + "source": [ + "catalog_tracks.to_parquet('../data/catalog_tracks.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "16e6863f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(array([ 148201., 192358., 172134., 268240., 311700., 262459.,\n", + " 387518., 431634., 351026., 505503., 549948., 442020.,\n", + " 624725., 498827., 702108., 747713., 592265., 826998.,\n", + " 870964., 683504., 950577., 996923., 777570., 1074763.,\n", + " 839507., 1157846., 1207839., 936417., 1289020., 1340665.,\n", + " 1036658., 1425276., 1475006., 1139436., 1561318., 1614665.,\n", + " 1241160., 1698673., 1311020., 1790139., 1842043., 1414538.,\n", + " 1933496., 1989027., 1527155., 2081109., 2138155., 1637404.,\n", + " 2233059., 1708755., 2324778., 2387956., 1824282., 2481403.,\n", + " 2542194., 1945960., 2644429., 2704845., 2068454., 2809762.,\n", + " 2873336., 2195813., 2983642., 2278917., 3093772., 3158010.,\n", + " 2408102., 3269353., 3334345., 2545551., 3454690., 3520903.,\n", + " 2685108., 3640694., 2771240., 3751363., 3817447., 2906555.,\n", + " 3933520., 3998960., 3035922., 4105882., 4164647., 3159655.,\n", + " 4261388., 4319254., 3273782., 4400208., 3328987., 4468841.,\n", + " 4495965., 3374974., 4495049., 4464815., 3306909., 4312803.,\n", + " 4140545., 2908108., 3418731., 1983502.]),\n", + " array([18993. 
, 18996.64, 19000.28, 19003.92, 19007.56, 19011.2 ,\n", + " 19014.84, 19018.48, 19022.12, 19025.76, 19029.4 , 19033.04,\n", + " 19036.68, 19040.32, 19043.96, 19047.6 , 19051.24, 19054.88,\n", + " 19058.52, 19062.16, 19065.8 , 19069.44, 19073.08, 19076.72,\n", + " 19080.36, 19084. , 19087.64, 19091.28, 19094.92, 19098.56,\n", + " 19102.2 , 19105.84, 19109.48, 19113.12, 19116.76, 19120.4 ,\n", + " 19124.04, 19127.68, 19131.32, 19134.96, 19138.6 , 19142.24,\n", + " 19145.88, 19149.52, 19153.16, 19156.8 , 19160.44, 19164.08,\n", + " 19167.72, 19171.36, 19175. , 19178.64, 19182.28, 19185.92,\n", + " 19189.56, 19193.2 , 19196.84, 19200.48, 19204.12, 19207.76,\n", + " 19211.4 , 19215.04, 19218.68, 19222.32, 19225.96, 19229.6 ,\n", + " 19233.24, 19236.88, 19240.52, 19244.16, 19247.8 , 19251.44,\n", + " 19255.08, 19258.72, 19262.36, 19266. , 19269.64, 19273.28,\n", + " 19276.92, 19280.56, 19284.2 , 19287.84, 19291.48, 19295.12,\n", + " 19298.76, 19302.4 , 19306.04, 19309.68, 19313.32, 19316.96,\n", + " 19320.6 , 19324.24, 19327.88, 19331.52, 19335.16, 19338.8 ,\n", + " 19342.44, 19346.08, 19349.72, 19353.36, 19357. ]),\n", + " )" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.hist(interactions['started_at'], bins=100)" + ] + }, + { + "cell_type": "markdown", + "id": "b1c32a5a-d3be-4f96-8dd9-f7860951020c", + "metadata": {}, + "source": [ + "Наиболее популярные жанры" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "1bc50491-9235-4d3c-a6c2-297f7c05a959", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamecount
1111pop166109
7575rap128206
102102allrock118461
6868electronics106478
33rusrap65958
4444foreignrap59772
1414rock55148
1616dance51595
2020ruspop46706
1313alternative42894
\n", + "
" + ], + "text/plain": [ + " id name count\n", + "11 11 pop 166109\n", + "75 75 rap 128206\n", + "102 102 allrock 118461\n", + "68 68 electronics 106478\n", + "3 3 rusrap 65958\n", + "44 44 foreignrap 59772\n", + "14 14 rock 55148\n", + "16 16 dance 51595\n", + "20 20 ruspop 46706\n", + "13 13 alternative 42894" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "catalog_genres = pd.read_parquet('../data/catalog_genres.parquet')\n", + "catalog_genres.sort_values('count', ascending=False).head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "0d850a07-ef1e-462f-891a-1cf89f2e24ef", + "metadata": {}, + "source": [ + "# Преобразование данных" + ] + }, + { + "cell_type": "markdown", + "id": "fabcf8d2-1192-4df5-b20b-fbb84689f57a", + "metadata": {}, + "source": [ + "Преобразуем данные в формат, более пригодный для дальнейшего использования в расчётах рекомендаций." + ] + }, + { + "cell_type": "markdown", + "id": "72ecbbed-c560-44d9-9c14-86c7dc76f399", + "metadata": {}, + "source": [ + "# Очистка памяти" + ] + }, + { + "cell_type": "markdown", + "id": "b5358ede-ba6e-4c4f-bd73-5b9344f0ba79", + "metadata": {}, + "source": [ + "Здесь может понадобиться очистка памяти, чтобы высвободить ресурсы для выполнения кода ниже. \n", + "\n", + "Приведите соответствующие код и комментарии, например:\n", + "- код для удаления больше не нужных переменных,\n", + "- комментарий, что следует перезапустить kernel, выполнить такие-то начальные секции и продолжить с этапа 3." + ] + }, + { + "cell_type": "markdown", + "id": "a0c64390", + "metadata": {}, + "source": [ + "Следует перезапустить kernel и выполнять действия начиная с 3-го этапа" + ] + }, + { + "cell_type": "markdown", + "id": "5b4e87cc", + "metadata": {}, + "source": [] + }, + { + "cell_type": "markdown", + "id": "075fd983", + "metadata": {}, + "source": [ + "### ВЫВОДЫ:\n", + "\n", + "Больше всего треков, которые слушают менее 20 раз. 
Вероятно, их можно убрать из выборки для существенного сокращения размера (делать этого мы, конечно же, не будем)\n", + "\n", + "Треки из топ-10 находятся в моем сердечке :)\n", + "\n", + "Топ жанров возглавляют поп, рэп, рок и электроника; остальные отстают с большим отрывом — в полтора-два раза по количеству треков" + ] + }, + { + "cell_type": "markdown", + "id": "708503df-ee89-4cf3-8489-093dc478e2a8", + "metadata": {}, + "source": [ + "# === ЭТАП 3 ===" + ] + }, + { + "cell_type": "markdown", + "id": "fd77de22-e10f-4b42-85c1-8fb6f805fe68", + "metadata": {}, + "source": [ + "# Загрузка данных" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b255df91", + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import random\n", + "import scipy\n", + "import sklearn.preprocessing\n", + "from catboost import CatBoostClassifier, Pool" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f19fc8a5-bd2c-40d7-864a-ee75aca6d512", + "metadata": {}, + "outputs": [], + "source": [ + "items = pd.read_parquet('../data/items.parquet')\n", + "interactions = pd.read_parquet('../data/events.parquet')\n", + "#catalog = pd.read_parquet('data/catalog_names.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9b4ef95a", + "metadata": {}, + "outputs": [], + "source": [ + "catalog_genres = pd.read_parquet('../data/catalog_genres.parquet')\n", + "catalog_albums = pd.read_parquet('../data/catalog_albums.parquet')\n", + "catalog_artists = pd.read_parquet('../data/catalog_artists.parquet')\n", + "catalog_tracks = pd.read_parquet('../data/catalog_tracks.parquet')" + ] + }, + { + "cell_type": "markdown", + "id": "a694c023-6477-490b-939d-1cfa6f5f1b72", + "metadata": {}, + "source": [ + "# Разбиение данных" + ] + }, + { + "cell_type": "markdown", + "id": "fbd5f6e0-54e7-4428-8678-eabce505d82c", + "metadata": {}, + "source": [ + "Разбиваем данные на 
тренировочную, тестовую выборки." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "00c2dfa5-d8a2-47d1-922e-6eefee2c62d1", + "metadata": {}, + "outputs": [], + "source": [ + "train_test_global_time_split_date = pd.to_datetime(\"2022-12-16\")\n", + "interactions_train = interactions.loc[interactions[\"started_at\"] < train_test_global_time_split_date]\n", + "interactions_test = interactions.loc[interactions[\"started_at\"] >= train_test_global_time_split_date]\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bb261d5", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# В учебных целях берем 25% от всего, потому что иначе все выполняется бессовестно долго. \n", + "interactions_train = interactions_train.sample(frac=0.25, random_state = 42)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "781f047e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(50299870, 4)\n", + "(13514936, 4)\n" + ] + } + ], + "source": [ + "print(interactions_train.shape)\n", + "print(interactions_test.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e73cc82", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(13438773, 4)\n" + ] + } + ], + "source": [ + "# Drop items in test that are not in train\n", + "interactions_test = interactions_test.loc[~interactions_test['track_id'].isin(items_diff)]\n", + "print(interactions_test.shape)" + ] + }, + { + "cell_type": "markdown", + "id": "9131c7e6-8852-4556-b510-51f7253cc299", + "metadata": {}, + "source": [ + "# Топ популярных" + ] + }, + { + "cell_type": "markdown", + "id": "dd70d43a-88cc-4719-b291-feaed7136f30", + "metadata": {}, + "source": [ + "Рассчитаем рекомендации как топ популярных." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "ee45e200-b7d6-4f56-9077-aad431689b96", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
track_idcount
79205340427471
2349117852925393
4270953331100925232
4526923550524523657
3230472469282121052
.........
7479586856271111949
7936107564296111884
7947837594493411797
83945773011691
7466486834838911651
\n", + "

100 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " track_id count\n", + "7920 53404 27471\n", + "23491 178529 25393\n", + "427095 33311009 25232\n", + "452692 35505245 23657\n", + "323047 24692821 21052\n", + "... ... ...\n", + "747958 68562711 11949\n", + "793610 75642961 11884\n", + "794783 75944934 11797\n", + "8394 57730 11691\n", + "746648 68348389 11651\n", + "\n", + "[100 rows x 2 columns]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item_popularity = interactions_train.groupby('track_id')[['user_id']].count().reset_index().sort_values(by='user_id', ascending=False)[:100]\n", + "item_popularity.columns = ['track_id', 'count']\n", + "item_popularity" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e39319f4", + "metadata": {}, + "outputs": [], + "source": [ + "item_popularity['rank'] = item_popularity['count'].rank(ascending=False)\n", + "item_popularity['rank'] = item_popularity['rank'].astype(int)\n", + "item_popularity = item_popularity.rename(columns= {'track_id': 'item_id'})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bacefc16", + "metadata": {}, + "outputs": [], + "source": [ + "item_popularity.to_parquet('../recommendations/top_popular.parquet')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "4cc77f15", + "metadata": {}, + "outputs": [], + "source": [ + "item_popularity = pd.read_parquet('../recommendations/top_popular.parquet')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "705c40a6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
item_idcountrank
792053404274711
23491178529253932
42709533311009252323
45269235505245236574
32304724692821210525
............
747958685627111194996
793610756429611188497
794783759449341179798
8394577301169199
7466486834838911651100
\n", + "

100 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " item_id count rank\n", + "7920 53404 27471 1\n", + "23491 178529 25393 2\n", + "427095 33311009 25232 3\n", + "452692 35505245 23657 4\n", + "323047 24692821 21052 5\n", + "... ... ... ...\n", + "747958 68562711 11949 96\n", + "793610 75642961 11884 97\n", + "794783 75944934 11797 98\n", + "8394 57730 11691 99\n", + "746648 68348389 11651 100\n", + "\n", + "[100 rows x 3 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "item_popularity" + ] + }, + { + "cell_type": "markdown", + "id": "2ad231f2-6158-421a-b7fa-01d8bc3ed572", + "metadata": {}, + "source": [ + "# Персональные" + ] + }, + { + "cell_type": "markdown", + "id": "86159460-cd9d-4b63-8248-604ea3c9aebf", + "metadata": {}, + "source": [ + "Рассчитаем персональные рекомендации." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "25f19ff2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_155402/3672207653.py:2: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " interactions_test['target'] = 1 # Факт прослушивания\n" + ] + } + ], + "source": [ + "interactions_train['target'] = 1 # Факт прослушивания\n", + "interactions_test['target'] = 1 # Факт прослушивания" + ] + }, + { + "cell_type": "markdown", + "id": "af7a2e3c", + "metadata": {}, + "source": [ + "## ALS" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "51edbd36", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_155402/3892860961.py:7: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a 
DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " interactions_test[\"user_id_enc\"] = user_encoder.transform(interactions_test[\"user_id\"])\n", + "/tmp/ipykernel_155402/3892860961.py:15: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " interactions_test[\"track_id_enc\"] = item_encoder.transform(interactions_test[\"track_id\"])\n" + ] + } + ], + "source": [ + "user_encoder = sklearn.preprocessing.LabelEncoder()\n", + "\n", + "# перекодируем идентификаторы пользователей: \n", + "# из имеющихся в последовательность 0, 1, 2, ...\n", + "user_encoder.fit(interactions[\"user_id\"])\n", + "interactions_train[\"user_id_enc\"] = user_encoder.transform(interactions_train[\"user_id\"])\n", + "interactions_test[\"user_id_enc\"] = user_encoder.transform(interactions_test[\"user_id\"])\n", + "\n", + "# перекодируем идентификаторы объектов: \n", + "# из имеющихся в последовательность 0, 1, 2, ...\n", + "item_encoder = sklearn.preprocessing.LabelEncoder()\n", + "item_encoder.fit(items[\"track_id\"])\n", + "items[\"track_id_enc\"] = item_encoder.transform(items[\"track_id\"])\n", + "interactions_train[\"track_id_enc\"] = item_encoder.transform(interactions_train[\"track_id\"])\n", + "interactions_test[\"track_id_enc\"] = item_encoder.transform(interactions_test[\"track_id\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "d2dd9c63", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idtrack_idtrack_seqstarted_attargetuser_id_enctrack_id_enc
2731311288658912712742022-12-0311305663774484
2411182663351129572422022-10-1611177607473510
134785243890849981352022-12-141781869899346
1417291443117152022-06-0811721806832
2057227520003416212022-12-081569812285162
........................
621105625921895186222022-03-1511051720111587
382278471233440683832022-04-021277280321471
136682319593737151372022-11-111679358721033
5955372954859162022-08-051951241196517
896689124277598858972022-06-251686139376419
\n", + "

50299870 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " user_id track_id track_seq started_at target user_id_enc \\\n", + "273 1311288 65891271 274 2022-12-03 1 1305663 \n", + "241 1182663 35112957 242 2022-10-16 1 1177607 \n", + "134 785243 89084998 135 2022-12-14 1 781869 \n", + "14 172914 43117 15 2022-06-08 1 172180 \n", + "20 572275 20003416 21 2022-12-08 1 569812 \n", + ".. ... ... ... ... ... ... \n", + "621 1056259 2189518 622 2022-03-15 1 1051720 \n", + "382 278471 23344068 383 2022-04-02 1 277280 \n", + "136 682319 59373715 137 2022-11-11 1 679358 \n", + "5 955372 9548591 6 2022-08-05 1 951241 \n", + "896 689124 27759885 897 2022-06-25 1 686139 \n", + "\n", + " track_id_enc \n", + "273 774484 \n", + "241 473510 \n", + "134 899346 \n", + "14 6832 \n", + "20 285162 \n", + ".. ... \n", + "621 111587 \n", + "382 321471 \n", + "136 721033 \n", + "5 196517 \n", + "896 376419 \n", + "\n", + "[50299870 rows x 7 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "interactions_train\n" + ] + }, + { + "cell_type": "markdown", + "id": "aaf9dca7", + "metadata": {}, + "source": [ + "CSR matrix\n", + "\n", + "https://matteding.github.io/images/csr.gif" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "75313c81", + "metadata": {}, + "outputs": [], + "source": [ + "user_item_matrix_train = scipy.sparse.csr_matrix((\n", + " interactions_train[\"target\"],\n", + " (interactions_train['user_id_enc'], interactions_train['track_id_enc'])),\n", + " dtype=np.int8) " + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "ac801f85", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/andrey/work/institute/MLE/assets/recsys/.venv_recsys/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/home/andrey/work/institute/MLE/assets/recsys/.venv_recsys/lib/python3.10/site-packages/implicit/cpu/als.py:95: RuntimeWarning: OpenBLAS is configured to use 12 threads. It is highly recommended to disable its internal threadpool by setting the environment variable 'OPENBLAS_NUM_THREADS=1' or by calling 'threadpoolctl.threadpool_limits(1, \"blas\")'. Having OpenBLAS use a threadpool can lead to severe performance issues here.\n", + " check_blas_config()\n", + "100%|██████████| 30/30 [02:48<00:00, 5.61s/it]\n" + ] + } + ], + "source": [ + "from implicit.als import AlternatingLeastSquares\n", + "\n", + "als_model = AlternatingLeastSquares(factors=30, iterations=30, regularization=0.05, random_state=0)\n", + "als_model.fit(user_item_matrix_train) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98ce57e3", + "metadata": {}, + "outputs": [], + "source": [ + "def get_recommendations_als(user_item_matrix, model, user_id, user_encoder, item_encoder, include_seen=False, n=5):\n", + " \"\"\"\n", + " Возвращает отранжированные рекомендации для заданного пользователя\n", + " \"\"\"\n", + " user_id_enc = user_encoder.transform([user_id])[0]\n", + " recommendations = model.recommend(\n", + " user_id_enc, \n", + " user_item_matrix[user_id_enc], \n", + " filter_already_liked_items = not include_seen,\n", + " N=n)\n", + " recommendations = pd.DataFrame({\"item_id_enc\": recommendations[0], \"score\": recommendations[1]})\n", + " recommendations[\"item_id\"] = item_encoder.inverse_transform(recommendations[\"item_id_enc\"])\n", + " \n", + " return recommendations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e39e5237", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "887bf440", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "user_id: 39211\n", + "История (последние события)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idtrack_idtrack_seqstarted_attargetuser_id_enctrack_id_encnamegenres
5639211339774391432022-11-14139046459580Feel It Still[70]
573921115410295892022-11-03139046229667Танцуй, пока молодая[11, 20]
583921114293633852022-11-03139046219139Trebles 2013[16]
5939211371003701562022-11-16139046502447Белая ночь[11, 20]
6039211320430931332022-11-11139046428825Mr. Vain Recall[16]
6139211878664372022-10-2413904689718Y Si No Existieras (Y Si No Has De Volver) (Et...[325]
6239211674649592822022-12-11139046784874Побуяним[11, 20]
6339211219986921082022-11-07139046313896Don't Think I Will Forgive You[68]
6439211627938812672022-12-07139046748758Kanyelele[68]
65392111710808472022-10-26139046103882Bohemian Rhapsody[14, 17, 25, 102]
\n", + "
" + ], + "text/plain": [ + " user_id track_id track_seq started_at target user_id_enc \\\n", + "56 39211 33977439 143 2022-11-14 1 39046 \n", + "57 39211 15410295 89 2022-11-03 1 39046 \n", + "58 39211 14293633 85 2022-11-03 1 39046 \n", + "59 39211 37100370 156 2022-11-16 1 39046 \n", + "60 39211 32043093 133 2022-11-11 1 39046 \n", + "61 39211 878664 37 2022-10-24 1 39046 \n", + "62 39211 67464959 282 2022-12-11 1 39046 \n", + "63 39211 21998692 108 2022-11-07 1 39046 \n", + "64 39211 62793881 267 2022-12-07 1 39046 \n", + "65 39211 1710808 47 2022-10-26 1 39046 \n", + "\n", + " track_id_enc name \\\n", + "56 459580 Feel It Still \n", + "57 229667 Танцуй, пока молодая \n", + "58 219139 Trebles 2013 \n", + "59 502447 Белая ночь \n", + "60 428825 Mr. Vain Recall \n", + "61 89718 Y Si No Existieras (Y Si No Has De Volver) (Et... \n", + "62 784874 Побуяним \n", + "63 313896 Don't Think I Will Forgive You \n", + "64 748758 Kanyelele \n", + "65 103882 Bohemian Rhapsody \n", + "\n", + " genres \n", + "56 [70] \n", + "57 [11, 20] \n", + "58 [16] \n", + "59 [11, 20] \n", + "60 [16] \n", + "61 [325] \n", + "62 [11, 20] \n", + "63 [68] \n", + "64 [68] \n", + "65 [14, 17, 25, 102] " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "user_id = interactions_train['user_id'].sample().iat[0]\n", + "\n", + "print(f\"user_id: {user_id}\")\n", + "\n", + "print(\"История (последние события)\")\n", + "user_history = (\n", + " interactions_train\n", + " .query(\"user_id == @user_id\")\n", + " .merge(items.set_index(\"track_id\")[[\"name\", \"genres\"]], on=\"track_id\")\n", + ")\n", + "user_history_to_print = user_history.tail(10)\n", + "display(user_history_to_print)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "id": "13e662c5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Рекомендации\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
item_id_encscoreitem_idnamegenres
07741690.51117865851540Юность[11, 20]
15348470.25497539257277In My Mind[16]
24433470.23857832947997Shape of You[11]
36634910.18402652380688Любимка[11, 20]
47282750.17999260292250Blinding Lights[74]
\n", + "
" + ], + "text/plain": [ + " item_id_enc score item_id name genres\n", + "0 774169 0.511178 65851540 Юность [11, 20]\n", + "1 534847 0.254975 39257277 In My Mind [16]\n", + "2 443347 0.238578 32947997 Shape of You [11]\n", + "3 663491 0.184026 52380688 Любимка [11, 20]\n", + "4 728275 0.179992 60292250 Blinding Lights [74]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "print(\"Рекомендации\")\n", + "user_recommendations_als = get_recommendations_als(user_item_matrix_train, als_model, user_id, user_encoder, item_encoder, include_seen=True)\n", + "user_recommendations_als = user_recommendations_als.merge(items.set_index(\"track_id\")[[\"name\", \"genres\"]], left_on=\"item_id\", right_on=\"track_id\")\n", + "\n", + "display(user_recommendations_als)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9ab3670e", + "metadata": {}, + "outputs": [], + "source": [ + "# !!! 45 minutes \n", + "# чтобы не ждать - переместиться на строку считывания parquet\n", + "\n", + "# получаем список всех возможных user_id (перекодированных)\n", + "user_ids_encoded = range(len(user_encoder.classes_)-1)\n", + "\n", + "# получаем рекомендации для всех по+льзователей \n", + "\n", + "\n", + "als_recommendations = als_model.recommend(\n", + " user_ids_encoded, \n", + " user_item_matrix_train[user_ids_encoded], \n", + " filter_already_liked_items=False, N=30)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c32a23c0", + "metadata": {}, + "outputs": [], + "source": [ + "# преобразуем полученные рекомендации в табличный формат\n", + "item_ids_enc = als_recommendations[0]\n", + "als_scores = als_recommendations[1]\n", + "\n", + "als_recommendations = pd.DataFrame({\n", + " \"user_id_enc\": user_ids_encoded,\n", + " \"item_id_enc\": item_ids_enc.tolist(), \n", + " \"score\": als_scores.tolist()})\n", + "als_recommendations = als_recommendations.explode([\"item_id_enc\", \"score\"], ignore_index=True)\n", + "\n", + 
"# приводим типы данных\n", + "als_recommendations[\"item_id_enc\"] = als_recommendations[\"item_id_enc\"].astype(\"int\")\n", + "als_recommendations[\"score\"] = als_recommendations[\"score\"].astype(\"float\")\n", + "\n", + "# получаем изначальные идентификаторы\n", + "als_recommendations[\"user_id\"] = user_encoder.inverse_transform(als_recommendations[\"user_id_enc\"])\n", + "als_recommendations[\"item_id\"] = item_encoder.inverse_transform(als_recommendations[\"item_id_enc\"])\n", + "als_recommendations = als_recommendations.drop(columns=[\"user_id_enc\", \"item_id_enc\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "625dd757", + "metadata": {}, + "outputs": [], + "source": [ + "als_recommendations = als_recommendations[[\"user_id\", \"item_id\", \"score\"]]\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15923526", + "metadata": {}, + "outputs": [], + "source": [ + "als_recommendations = als_recommendations.sort_values(by = \"score\", ascending=False)\n", + "als_recommendations.to_parquet(\"recsys/recomendations/personal_als.parquet\") " + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "ff125847", + "metadata": {}, + "outputs": [], + "source": [ + "als_recommendations = pd.read_parquet(\"../recommendations/personal_als.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "5f09dc7e-7c91-4355-860a-b9cfb9f33f15", + "metadata": {}, + "source": [ + "# Похожие" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dfc5d8ba", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# !!!! 
40 minutes\n", + "# чтобы не ждать - переместиться на строку считывания parquet\n", + "\n", + "# получим энкодированные идентификаторы всех объектов, известных нам из events_train\n", + "train_item_ids_enc = interactions_train['track_id_enc'].unique()\n", + "\n", + "max_similar_items = 10\n", + "\n", + "# получаем списки похожих объектов, используя ранее полученную ALS-модель\n", + "# метод similar_items возвращает и сам объект, как наиболее похожий\n", + "# этот объект мы позже отфильтруем, но сейчас запросим на 1 больше\n", + "\n", + "\n", + "similar_items = als_model.similar_items(train_item_ids_enc, N=max_similar_items+1)\n", + "\n", + "# преобразуем полученные списки в табличный формат\n", + "sim_item_item_ids_enc = similar_items[0]\n", + "sim_item_scores = similar_items[1]\n", + "\n", + "similar_items = pd.DataFrame({\n", + " \"track_id_enc\": train_item_ids_enc,\n", + " \"sim_item_id_enc\": sim_item_item_ids_enc.tolist(), \n", + " \"score\": sim_item_scores.tolist()})\n", + "similar_items = similar_items.sort_values('score')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62881cf9", + "metadata": {}, + "outputs": [], + "source": [ + "# Надо бы сделать обратную трансформацию, но зависает спустя несколько часов работы.... Пока будем считать, что энкодед это истиные идентификаторы \n", + "# similar_items[\"item_id\"] = similar_items['track_id_enc'].apply(lambda x: item_encoder.inverse_transform([x])[0])\n", + "# similar_items[\"sim_item_id\"] = similar_items['sim_item_id_enc'].apply(lambda x: item_encoder.inverse_transform(x[1:]))\n" + ] + }, + { + "cell_type": "markdown", + "id": "1dfcb683-b440-40a8-9975-894156a53872", + "metadata": {}, + "source": [ + "Рассчитаем похожие, они позже пригодятся для онлайн-рекомендаций." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "a75d07ee-4b12-4ce5-aa85-e45cb7a7a4f0", + "metadata": {}, + "outputs": [], + "source": [ + "similar_items.rename(columns={'track_id_enc':'item_id', 'sim_item_id_enc':'sim_item_id'}, inplace=True)\n", + "similar_items.to_parquet(\"../recommendations/similar_items.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "ce370904-4c49-4152-8706-416074ea9b9a", + "metadata": {}, + "outputs": [], + "source": [ + "similar_items = pd.read_parquet(\"../recommendations/similar_items.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "id": "be54154c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
item_idsim_item_idscore
787744440450[440450, 794964, 662906, 789388][0.9999995231628418, 0.8756284713745117, 0.841...
89069478024[478024, 542933, 200580, 501613][0.9999995231628418, 0.950843870639801, 0.9462...
142892170948[170948, 357991, 184016, 56267][0.9999995231628418, 0.9587958455085754, 0.956...
636868178504[178504, 92743, 206733, 337648][0.9999995231628418, 0.9872869849205017, 0.986...
694565429070[429070, 428326, 289784, 428323][0.9999995827674866, 0.8366211652755737, 0.836...
............
750870752554[752554, 511740, 397649, 718404][1.0000004768371582, 1.0000003576278687, 0.972...
393268346630[346651, 346630, 346622, 346616][1.0000004768371582, 1.0000004768371582, 1.000...
559345346651[346651, 346630, 346622, 346616][1.0000004768371582, 1.0000004768371582, 1.000...
476378346616[346651, 346630, 346622, 346616][1.0000004768371582, 1.0000004768371582, 1.000...
500411346622[346651, 346630, 346622, 346616][1.0000004768371582, 1.0000004768371582, 1.000...
\n", + "

873606 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " item_id sim_item_id \\\n", + "787744 440450 [440450, 794964, 662906, 789388] \n", + "89069 478024 [478024, 542933, 200580, 501613] \n", + "142892 170948 [170948, 357991, 184016, 56267] \n", + "636868 178504 [178504, 92743, 206733, 337648] \n", + "694565 429070 [429070, 428326, 289784, 428323] \n", + "... ... ... \n", + "750870 752554 [752554, 511740, 397649, 718404] \n", + "393268 346630 [346651, 346630, 346622, 346616] \n", + "559345 346651 [346651, 346630, 346622, 346616] \n", + "476378 346616 [346651, 346630, 346622, 346616] \n", + "500411 346622 [346651, 346630, 346622, 346616] \n", + "\n", + " score \n", + "787744 [0.9999995231628418, 0.8756284713745117, 0.841... \n", + "89069 [0.9999995231628418, 0.950843870639801, 0.9462... \n", + "142892 [0.9999995231628418, 0.9587958455085754, 0.956... \n", + "636868 [0.9999995231628418, 0.9872869849205017, 0.986... \n", + "694565 [0.9999995827674866, 0.8366211652755737, 0.836... \n", + "... ... \n", + "750870 [1.0000004768371582, 1.0000003576278687, 0.972... \n", + "393268 [1.0000004768371582, 1.0000004768371582, 1.000... \n", + "559345 [1.0000004768371582, 1.0000004768371582, 1.000... \n", + "476378 [1.0000004768371582, 1.0000004768371582, 1.000... \n", + "500411 [1.0000004768371582, 1.0000004768371582, 1.000... 
\n", + "\n", + "[873606 rows x 3 columns]" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "similar_items" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1011f608", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "ExecuteTimeLog": [ + { + "duration": 66, + "start_time": "2024-09-02T18:34:52.193Z" + }, + { + "duration": 46, + "start_time": "2024-09-02T18:36:43.983Z" + }, + { + "duration": 55, + "start_time": "2024-09-02T18:40:09.918Z" + }, + { + "duration": 53, + "start_time": "2024-09-02T18:43:19.919Z" + }, + { + "duration": 54, + "start_time": "2024-09-02T18:44:22.195Z" + }, + { + "duration": 53, + "start_time": "2024-09-02T18:45:24.351Z" + }, + { + "duration": 61, + "start_time": "2024-09-02T19:00:18.348Z" + } + ], + "kernelspec": { + "display_name": ".venv_recsys", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": true, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": true + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/assets/recsys/service/events/Dockerfile b/assets/recsys/service/events/Dockerfile new file mode 100644 index 0000000..157c63b --- /dev/null +++ b/assets/recsys/service/events/Dockerfile @@ -0,0 +1,9 @@ +FROM python:3.11-slim + +COPY . 
/events_app +WORKDIR /events_app +RUN pip install -r requirements.txt +EXPOSE 8020 + +CMD ["uvicorn", "events_service:app", "--port", "8020", "--host", "0.0.0.0"] + diff --git a/assets/recsys/service/events/events_service.py b/assets/recsys/service/events/events_service.py new file mode 100644 index 0000000..64a7ff7 --- /dev/null +++ b/assets/recsys/service/events/events_service.py @@ -0,0 +1,54 @@ +from fastapi import FastAPI + + +class EventStore: + + def __init__(self, max_events_per_user=10): + self.events = {} + self.max_events_per_user = max_events_per_user + + + def get(self, user_id): + """ + Возвращает события для пользователя + """ + + if user_id in self.events: + user_events = self.events[user_id] + else: + user_events = [] + + return user_events + + + def put(self, user_id, item_id): + """ + Сохраняет событие для пользователя + """ + user_events = self.get(user_id) + self.events[user_id] = [item_id] + user_events[: self.max_events_per_user] + + +events_store = EventStore() + +app = FastAPI(title="events") + +@app.post("/put") +async def put(user_id: int, item_id: int): + """ + Сохраняет событие для user_id, item_id + """ + events_store.put(user_id, item_id) + + return {"result": "ok"} + + +@app.get("/get") +async def get(user_id: int, k: int = 10): + """ + Возвращает список последних k событий для пользователя user_id + """ + events = events_store.get(user_id)[:k] + + return {"events": events} + diff --git a/assets/recsys/service/events/requirements.txt b/assets/recsys/service/events/requirements.txt new file mode 100644 index 0000000..f0615cf --- /dev/null +++ b/assets/recsys/service/events/requirements.txt @@ -0,0 +1,2 @@ +fastapi +uvicorn \ No newline at end of file diff --git a/assets/recsys/service/features/Dockerfile b/assets/recsys/service/features/Dockerfile new file mode 100644 index 0000000..d9eff2e --- /dev/null +++ b/assets/recsys/service/features/Dockerfile @@ -0,0 +1,9 @@ +FROM python:3.11-slim + +COPY . 
/features_app +WORKDIR /features_app +RUN pip install -r requirements.txt + +EXPOSE 8010 + +CMD ["uvicorn", "feature_service:app", "--port", "8010", "--host", "0.0.0.0"] \ No newline at end of file diff --git a/assets/recsys/service/features/feature_service.py b/assets/recsys/service/features/feature_service.py new file mode 100644 index 0000000..0620ca9 --- /dev/null +++ b/assets/recsys/service/features/feature_service.py @@ -0,0 +1,57 @@ +import logging + +import pandas as pd +from fastapi import FastAPI + +PATH_TO_SIMILAR_ITEMS = '../../recommendations/similar_items.parquet' + +logger = logging.getLogger("uvicorn.error") + +class SimilarItems: + + def __init__(self, path, **kwargs): + + """ + Загружаем данные из файла + """ + logger.info(f"Loading data") + self._similar_items = pd.read_parquet(path) + self._similar_items = self._similar_items[kwargs['columns']] + self._similar_items = self._similar_items.set_index('item_id') + logger.info(f"Loaded") + + + def get(self, item_id: int, k: int = 10): + """ + Возвращает список k похожих объектов + """ + + try: + i2i = self._similar_items.loc[item_id].head(k) + i2i = {"item_id_2": i2i["sim_item_id"].tolist(), "score": i2i['score'].tolist()} + except KeyError: + logger.error("No recommendations found") + i2i = {"item_id_2": [], "score": []} + except: + logger.error("problem with similar recomendations") + + return i2i + + +sim_items_store = SimilarItems( + PATH_TO_SIMILAR_ITEMS, + columns=["item_id", "sim_item_id", "score"]) + +logger.info("Ready!") + +# создаём приложение FastAPI +app = FastAPI(title="features") + +@app.get("/similar_items") +async def recommendations(item_id: int, k: int = 10): + """ + Возвращает список похожих объектов длиной k для item_id + """ + i2i = sim_items_store.get(item_id, k) + + return i2i \ No newline at end of file diff --git a/assets/recsys/service/features/requirements.txt b/assets/recsys/service/features/requirements.txt new file mode 100644 index 0000000..54b0a0b --- /dev/null +++ 
b/assets/recsys/service/features/requirements.txt @@ -0,0 +1,6 @@ +pandas +matplotlib +pyarrow +implicit==0.7.2 +fastapi +uvicorn diff --git a/assets/recsys/service/recommendations/Dockerfile b/assets/recsys/service/recommendations/Dockerfile new file mode 100644 index 0000000..69f005d --- /dev/null +++ b/assets/recsys/service/recommendations/Dockerfile @@ -0,0 +1,9 @@ +FROM python:3.11-slim + +COPY . /recs_app +WORKDIR /recs_app +RUN pip install -r requirements.txt +RUN export $(cat ./.env | xargs) +EXPOSE 8000 + +CMD ["uvicorn", "recommendation_service:app", "--port", "8000", "--host", "0.0.0.0"] \ No newline at end of file diff --git a/assets/recsys/service/recommendations/rec_handler.py b/assets/recsys/service/recommendations/rec_handler.py new file mode 100644 index 0000000..44b23e9 --- /dev/null +++ b/assets/recsys/service/recommendations/rec_handler.py @@ -0,0 +1,43 @@ +import logging as logger +import pandas as pd + +class Recommendations: + + def __init__(self): + + self._recs = {"personal": None, "default": None} + + def load(self, rec_type, path, **kwargs): + """ + Загружает рекомендации из файла + """ + + logger.info(f"Loading recommendations, type: {rec_type}") + self._recs[rec_type] = pd.read_parquet(path, **kwargs) + if rec_type == "personal": + self._recs[rec_type] = self._recs[rec_type].set_index("user_id") + logger.info(f"Loaded") + + def get(self, user_id: int, k: int=100): + """ + Возвращает список рекомендаций для пользователя + """ + try: + recs = self._recs["personal"].loc[user_id] + recs = recs["item_id"].to_list()[:k] + except KeyError: + recs = self._recs["default"] + recs = recs["item_id"].to_list()[:k] + except: + logger.error("No recommendations found") + recs = [] + + if not recs: + logger.warning(f"No default recommendations available for user {user_id}") + recs = [] + else: + logger.info(f'recs: {recs}') + + return recs + + diff --git a/assets/recsys/service/recommendations/recommendation_service.py 
b/assets/recsys/service/recommendations/recommendation_service.py new file mode 100644 index 0000000..3122d58 --- /dev/null +++ b/assets/recsys/service/recommendations/recommendation_service.py @@ -0,0 +1,116 @@ +import logging +import requests +import os +import pandas as pd + +from fastapi import FastAPI +from rec_handler import Recommendations + +PATH_TO_RECOMENDATIONS = '../../recommendations/' +FILENAME_PERS_RECOMENDATIONS = 'personal_als.parquet' +FILENAME_TOP_POPULAR = 'top_popular.parquet' + +logger = logging.getLogger("uvicorn.error") + +rec_store = Recommendations() +features_store_url = "http://localhost:8010" +events_store_url = "http://localhost:8020" + + +logger.info("Starting") + +rec_store.load( +"personal", +PATH_TO_RECOMENDATIONS+FILENAME_PERS_RECOMENDATIONS, +columns=["user_id", "item_id", "score"], +) +rec_store.load( + "default", + PATH_TO_RECOMENDATIONS+FILENAME_TOP_POPULAR, + columns=["item_id", "rank"], +) + + +# создаём приложение FastAPI +app = FastAPI(title="recommendations") + + +@app.post("/recommendations_offline") +async def recommendations_offline(user_id: int, k: int = 100): + """ + Возвращает список рекомендаций длиной k для пользователя user_id + """ + + recs = rec_store.get(user_id=user_id, k=k) + return {"recs": recs} + + +@app.post("/recommendations_online") +async def recommendations_online(user_id: int, k: int = 100): + """ + Возвращает список онлайн-рекомендаций длиной k для пользователя user_id + """ + + headers = {"Content-type": "application/json", "Accept": "text/plain"} + + # получаем последние события пользователя + params = {"user_id": user_id, "k": 3} + resp = requests.get(events_store_url + "/get", headers=headers, params=params) + events = resp.json() + events = events["events"] + + # получаем список похожих объектов + if len(events) > 0: + items = [] + scores = [] + for item_id in events: + params = {"item_id": item_id, "k": k} + headers = {"Content-type": "application/json", "Accept": "text/plain"} + resp = 
requests.get(features_store_url + "/similar_items", headers=headers, params=params) + if resp.status_code == 200: + similar_items = resp.json() + else: + similar_items = None + print(f"status code: {resp.status_code}") + items += similar_items["item_id_2"] + scores += similar_items["score"] + combined = list(zip(items, scores)) + combined = sorted(combined, key=lambda x: x[1], reverse=True) + combined = [item for item, _ in combined] + recs = combined[:k] + + else: + recs = [] + + return {"recs": recs} + + +@app.post("/recommendations") +async def recommendations(user_id: int, k: int = 100): + """ + Возвращает список рекомендаций длиной k для пользователя user_id + """ + + recs_offline = await recommendations_offline(user_id, k) + recs_online = await recommendations_online(user_id, k) + + recs_offline = recs_offline["recs"] + recs_online = recs_online["recs"] + recs_blended = [] + + min_length = min(len(recs_offline), len(recs_online)) + # чередуем элементы из списков, пока позволяет минимальная длина + for i in range(min_length): + recs_blended.append(recs_online[i]) + recs_blended.append(recs_offline[i]) + + # добавляем оставшиеся элементы в конец + recs_blended += recs_online[min_length:] + recs_blended += recs_offline[min_length:] + + # оставляем только первые k рекомендаций + recs_blended = recs_blended[:k] + + return {"recs": recs_blended} + + diff --git a/assets/recsys/service/recommendations/requirements.txt b/assets/recsys/service/recommendations/requirements.txt new file mode 100644 index 0000000..a62fbe7 --- /dev/null +++ b/assets/recsys/service/recommendations/requirements.txt @@ -0,0 +1,12 @@ +pandas +matplotlib +pyarrow==13.0.0 +mlflow==2.7.1 +psycopg==3.1.12 +psycopg[binary,pool] +boto3==1.34.78 +implicit==0.7.2 +fastapi +uvicorn +prometheus-fastapi-instrumentator +python-dotenv \ No newline at end of file