{ "cells": [ { "cell_type": "code", "execution_count": 1, "source": [
 "# This notebook was created with VS Code on Windows.\r\n",
 "# Optional: create a Python virtual environment. It only takes effect if .venv\r\n",
 "# is selected as the notebook kernel before the installs below are run.\r\n",
 "!python -m venv .venv\r\n",
 "# On macOS/Linux use python3 instead.\r\n",
 "# You may need to run sudo apt-get install python3-venv first\r\n",
 "#python3 -m venv .venv\r\n",
 "\r\n",
 "# Install Python packages.\r\n",
 "# %pip (instead of !pip) installs into the interpreter that runs this kernel;\r\n",
 "# a plain !pip may belong to a different Python that happens to be on PATH.\r\n",
 "# --user is omitted because it is rejected inside a virtual environment.\r\n",
 "%pip install --upgrade pip setuptools\r\n",
 "# The module is imported as `surprise`, but the package that provides it is\r\n",
 "# scikit-surprise (the `surprise` package on PyPI only pulls that one in).\r\n",
 "%pip install seaborn numpy pandas matplotlib plotly nbformat scikit-surprise\r\n"
 ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [
 "Requirement already satisfied: pip in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (21.2.4)\n",
 "Requirement already satisfied: setuptools in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (57.4.0)\n",
 "Requirement already satisfied: seaborn in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (0.11.2)\n",
 "Requirement already satisfied: matplotlib>=2.2 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (3.3.4)\n",
 "Requirement already satisfied: numpy>=1.15 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (1.19.5)\n",
 "Requirement already satisfied: scipy>=1.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (1.6.1)\n",
 "Requirement already satisfied: pandas>=0.23 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (1.2.2)\n",
 "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (1.3.1)\n",
 "Requirement already satisfied: cycler>=0.10 
in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n", "Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from matplotlib>=2.2->seaborn) (2.8.1)\n", "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (8.1.0)\n", "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (2.4.7)\n", "Requirement already satisfied: six in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.15.0)\n", "Requirement already satisfied: pytz>=2017.3 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas>=0.23->seaborn) (2021.1)\n", "Requirement already satisfied: numpy in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (1.19.5)\n", "Requirement already satisfied: pandas in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (1.2.2)\n", "Requirement already satisfied: pytz>=2017.3 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas) (2021.1)\n", "Requirement already satisfied: python-dateutil>=2.7.3 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from pandas) (2.8.1)\n", "Requirement already satisfied: numpy>=1.16.5 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas) (1.19.5)\n", "Requirement already satisfied: six>=1.5 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\n", "Requirement already satisfied: matplotlib in 
c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (3.3.4)\n", "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (8.1.0)\n", "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (2.4.7)\n", "Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from matplotlib) (2.8.1)\n", "Requirement already satisfied: cycler>=0.10 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (0.10.0)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (1.3.1)\n", "Requirement already satisfied: numpy>=1.15 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (1.19.5)\n", "Requirement already satisfied: six in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from cycler>=0.10->matplotlib) (1.15.0)\n", "Requirement already satisfied: plotly in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (5.3.0)\n", "Requirement already satisfied: tenacity>=6.2.0 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from plotly) (8.0.1)\n", "Requirement already satisfied: six in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from plotly) (1.15.0)\n", "Requirement already satisfied: nbformat in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (5.1.3)\n", "Requirement already satisfied: jupyter-core in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from nbformat) (4.7.1)\n", "Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages 
(from nbformat) (3.2.0)\n", "Requirement already satisfied: ipython-genutils in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from nbformat) (0.2.0)\n", "Requirement already satisfied: traitlets>=4.1 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from nbformat) (5.0.5)\n", "Requirement already satisfied: six>=1.11.0 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat) (1.15.0)\n", "Requirement already satisfied: attrs>=17.4.0 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat) (21.2.0)\n", "Requirement already satisfied: pyrsistent>=0.14.0 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat) (0.18.0)\n", "Requirement already satisfied: setuptools in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat) (57.4.0)\n", "Requirement already satisfied: pywin32>=1.0 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from jupyter-core->nbformat) (300)\n", "Collecting surprise\n", " Using cached surprise-0.1-py2.py3-none-any.whl (1.8 kB)\n", "Collecting scikit-surprise\n", " Using cached scikit-surprise-1.1.1.tar.gz (11.8 MB)\n", "Requirement already satisfied: joblib>=0.11 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from scikit-surprise->surprise) (1.0.1)\n", "Requirement already satisfied: numpy>=1.11.2 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from scikit-surprise->surprise) (1.19.5)\n", "Requirement already satisfied: scipy>=1.0.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from scikit-surprise->surprise) (1.6.1)\n", "Requirement already satisfied: six>=1.10.0 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from 
scikit-surprise->surprise) (1.15.0)\n",
 "Building wheels for collected packages: scikit-surprise\n",
 " Building wheel for scikit-surprise (setup.py): started\n",
 " Building wheel for scikit-surprise (setup.py): finished with status 'done'\n",
 " Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp38-cp38-win_amd64.whl size=734505 sha256=072c5d6f9bb826bb28bec68bdb20bf4de7eec14a769df95110fef9cd4def197d\n",
 " Stored in directory: c:\\users\\oli\\appdata\\local\\pip\\cache\\wheels\\20\\91\\57\\2965d4cff1b8ac7ed1b6fa25741882af3974b54a31759e10b6\n",
 "Successfully built scikit-surprise\n",
 "Installing collected packages: scikit-surprise, surprise\n",
 "Successfully installed scikit-surprise-1.1.1 surprise-0.1\n"
 ] }, { "output_type": "stream", "name": "stderr", "text": [
 " WARNING: The script surprise.exe is installed in 'C:\\Users\\Oli\\AppData\\Roaming\\Python\\Python38\\Scripts' which is not on PATH.\n",
 " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n"
 ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": 2, "source": [
 "import numpy as np # maths\r\n",
 "import pandas as pd # data processing\r\n",
 "import matplotlib.pyplot as plt\r\n",
 "import seaborn as sns\r\n",
 "import os\r\n",
 "import re\r\n",
 "\r\n",
 "from plotly.offline import init_notebook_mode, iplot\r\n",
 "import plotly.graph_objs as go\r\n",
 "import plotly.offline as py\r\n",
 "py.init_notebook_mode(connected=True)\r\n",
 "\r\n",
 "import warnings\r\n",
 "warnings.filterwarnings('ignore')\r\n",
 "\r\n",
 "plt.style.use('fivethirtyeight')\r\n",
 "plt.rcParams['figure.figsize'] = [18, 8]"
 ], "outputs": [ { "output_type": "display_data", "data": { "text/html": [ " \n", " " ] }, "metadata": {} } ], "metadata": {} }, { "cell_type": "code", "execution_count": 3, "source": [
 "# Import tables\r\n",
 "# The files have no header row and use '::' as separator; a multi-character\r\n",
 "# separator needs the python parsing engine.\r\n",
 "# NOTE(review): the encoding is set to Latin-1 on the assumption that the public\r\n",
 "# MovieLens 1M files are ISO-8859-1 (accented movie titles) - confirm for your copy.\r\n",
 "# Without an explicit encoding such characters may be replaced or fail to decode,\r\n",
 "# depending on the pandas version.\r\n",
 "reviews = pd.read_csv('./ml-1m/ratings.dat', names=['userId', 'movieId', 'rating', 'timestamp'],\r\n",
 "                      delimiter='::', engine='python', encoding='latin-1')\r\n",
 "movies = pd.read_csv('./ml-1m/movies.dat', names=['movieId', 'title', 'genres'],\r\n",
 "                     delimiter='::', engine='python', encoding='latin-1')\r\n",
 "users = pd.read_csv('./ml-1m/users.dat', names=['userId', 'gender', 'age', 'occupation', 'zip'],\r\n",
 "                    delimiter='::', engine='python', encoding='latin-1')\r\n",
 "\r\n",
 "# Print table shapes\r\n",
 "print('Reviews shape:', reviews.shape)\r\n",
 "print('Users shape:', users.shape)\r\n",
 "print('Movies shape:', movies.shape)"
 ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Reviews shape: (1000209, 4)\n", "Users shape: (6040, 5)\n", "Movies shape: (3883, 3)\n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": 4, "source": [
 "# Drop unused attributes.\r\n",
 "# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell\r\n",
 "# safe to re-run: a second run finds the column already gone and does nothing.\r\n",
 "reviews = reviews.drop(columns=['timestamp'], errors='ignore') # Time\r\n",
 "users = users.drop(columns=['zip'], errors='ignore') # Zip-Code\r\n",
 "\r\n",
 "# Extract the release year from the title into an extra attribute.\r\n",
 "# The year is taken from a trailing '(YYYY)' and stays a string (NaN if absent).\r\n",
 "movies['release_year'] = movies['title'].str.extract(r'(?:\\((\\d{4})\\))?\\s*$', expand=False)"
 ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 5, "source": [
 "# Print movie table\r\n",
 "movies.head()"
 ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " movieId title genres \\\n", "0 1 Toy Story (1995) Animation|Children's|Comedy \n", "1 2 Jumanji (1995) Adventure|Children's|Fantasy \n", "2 3 Grumpier Old Men (1995) Comedy|Romance \n", "3 4 Waiting to Exhale (1995) Comedy|Drama \n", "4 5 Father of the Bride Part II (1995) Comedy \n", "\n", " release_year \n", "0 1995 \n", "1 1995 \n", "2 1995 \n", "3 1995 \n", "4 1995 " ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdtitlegenresrelease_year
01Toy Story (1995)Animation|Children's|Comedy1995
12Jumanji (1995)Adventure|Children's|Fantasy1995
23Grumpier Old Men (1995)Comedy|Romance1995
34Waiting to Exhale (1995)Comedy|Drama1995
45Father of the Bride Part II (1995)Comedy1995
\n", "
" ] }, "metadata": {}, "execution_count": 5 } ], "metadata": {} }, { "cell_type": "code", "execution_count": 6, "source": [
 "# Translate coded feature values into readable labels (codes per README_users.txt)\r\n",
 "ages_map = {\r\n",
 "    1: 'Under 18',\r\n",
 "    18: '18 - 24',\r\n",
 "    25: '25 - 34',\r\n",
 "    35: '35 - 44',\r\n",
 "    45: '45 - 49',\r\n",
 "    50: '50 - 55',\r\n",
 "    56: '56+',\r\n",
 "}\r\n",
 "\r\n",
 "occupations_map = {\r\n",
 "    0: 'Not specified',\r\n",
 "    1: 'Academic / Educator',\r\n",
 "    2: 'Artist',\r\n",
 "    3: 'Clerical / Admin',\r\n",
 "    4: 'College / Grad Student',\r\n",
 "    5: 'Customer Service',\r\n",
 "    6: 'Doctor / Health Care',\r\n",
 "    7: 'Executive / Managerial',\r\n",
 "    8: 'Farmer',\r\n",
 "    9: 'Homemaker',\r\n",
 "    10: 'K-12 student',\r\n",
 "    11: 'Lawyer',\r\n",
 "    12: 'Programmer',\r\n",
 "    13: 'Retired',\r\n",
 "    14: 'Sales / Marketing',\r\n",
 "    15: 'Scientist',\r\n",
 "    16: 'Self-Employed',\r\n",
 "    17: 'Technician / Engineer',\r\n",
 "    18: 'Tradesman / Craftsman',\r\n",
 "    19: 'Unemployed',\r\n",
 "    20: 'Writer',\r\n",
 "}\r\n",
 "\r\n",
 "gender_map = {'M': 'Male', 'F': 'Female'}\r\n",
 "\r\n",
 "# Series.replace leaves values without a matching key untouched, so running this\r\n",
 "# cell a second time is a no-op. Series.map would instead turn the labels that\r\n",
 "# are already translated into NaN.\r\n",
 "users['age'] = users['age'].replace(ages_map)\r\n",
 "users['occupation'] = users['occupation'].replace(occupations_map)\r\n",
 "users['gender'] = users['gender'].replace(gender_map)"
 ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 7, "source": [
 "# Plot age categories\r\n",
 "\r\n",
 "# Bar order follows the age codes (youngest group first); taken from ages_map\r\n",
 "# so the labels cannot drift apart from the mapping.\r\n",
 "age_reindex = list(ages_map.values())\r\n",
 "age_counts = users['age'].value_counts().reindex(age_reindex)\r\n",
 "ax = sns.barplot(x=age_counts.values, y=age_counts.index, palette='magma')\r\n",
 "ax.set_title('Users age', fontsize=12)\r\n",
 "\r\n",
 "plt.show()"
 ], "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/svg+xml": "\r\n\r\n\r\n\r\n \r\n \r\n \r\n \r\n 2021-08-31T13:50:41.214400\r\n image/svg+xml\r\n \r\n \r\n Matplotlib v3.3.4, https://matplotlib.org/\r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n 
\r\n\r\n", "image/png": "" }, "metadata": {} } ], "metadata": {} }, { "cell_type": "code", "execution_count": 8, "source": [
 "# Plot gender of users\r\n",
 "gender_counts = users['gender'].value_counts()\r\n",
 "colors1 = ['lightblue', 'pink']\r\n",
 "\r\n",
 "# One pie trace; hole=0.5 cuts out the centre of the pie\r\n",
 "pie = go.Pie(\r\n",
 "    labels=gender_counts.index,\r\n",
 "    values=gender_counts.values,\r\n",
 "    marker={'colors': colors1},\r\n",
 "    hole=0.5,\r\n",
 ")\r\n",
 "# Title, font size and a horizontally laid out legend\r\n",
 "layout = go.Layout(\r\n",
 "    title='Gender Users',\r\n",
 "    font={'size': 12},\r\n",
 "    legend={'orientation': 'h'},\r\n",
 ")\r\n",
 "\r\n",
 "fig = go.Figure(data=[pie], layout=layout)\r\n",
 "py.iplot(fig)"
 ], "outputs": [ { "output_type": "display_data", "data": { "text/html": [ "
" ], "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "hole": 0.5, "labels": [ "Male", "Female" ], "marker": { "colors": [ "lightblue", "pink" ] }, "type": "pie", "values": [ 4331, 1709 ] } ], "layout": { "font": { "size": 12 }, "legend": { "orientation": "h" }, "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 
0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": 
{ "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 
0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": 
"" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Gender Users" } } } }, "metadata": {} } ], "metadata": {} }, { "cell_type": "code", "execution_count": 9, "source": [
 "# Merge reviews, movie and user dataset.\r\n",
 "# Both joins are left joins, so every review row is kept even when its movie\r\n",
 "# or user has no match in the other table.\r\n",
 "final_df = (\r\n",
 "    reviews\r\n",
 "    .merge(movies, on='movieId', how='left')\r\n",
 "    .merge(users, on='userId', how='left')\r\n",
 ")\r\n",
 "print('final_df shape:', final_df.shape)\r\n",
 "final_df.head()"
 ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "final_df shape: (1000209, 9)\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ " userId movieId rating title \\\n", "0 1 1193 5 One Flew Over the Cuckoo's Nest (1975) \n", "1 1 661 3 James and the Giant Peach (1996) \n", "2 1 914 3 My Fair Lady (1964) \n", "3 1 3408 4 Erin Brockovich (2000) \n", "4 1 2355 5 Bug's Life, A (1998) \n", "\n", " genres release_year gender age occupation \n", "0 Drama 1975 Female Under 18 K-12 student \n", "1 Animation|Children's|Musical 1996 Female Under 18 K-12 student \n", "2 Musical|Romance 1964 Female Under 18 K-12 student \n", "3 Drama 2000 Female Under 18 K-12 student \n", "4 Animation|Children's|Comedy 1998 Female Under 18 K-12 student " ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
userIdmovieIdratingtitlegenresrelease_yeargenderageoccupation
0111935One Flew Over the Cuckoo's Nest (1975)Drama1975FemaleUnder 18K-12 student
116613James and the Giant Peach (1996)Animation|Children's|Musical1996FemaleUnder 18K-12 student
219143My Fair Lady (1964)Musical|Romance1964FemaleUnder 18K-12 student
3134084Erin Brockovich (2000)Drama2000FemaleUnder 18K-12 student
4123555Bug's Life, A (1998)Animation|Children's|Comedy1998FemaleUnder 18K-12 student
\n", "
" ] }, "metadata": {}, "execution_count": 9 } ], "metadata": {} }, { "cell_type": "code", "execution_count": 10, "source": [
 "# Ten most frequently rated titles among users in the '18 - 24' age group\r\n",
 "is_18_to_24 = final_df['age'] == '18 - 24'\r\n",
 "final_df.loc[is_18_to_24, 'title'].value_counts().head(10).to_frame()"
 ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title\n", "American Beauty (1999) 715\n", "Star Wars: Episode VI - Return of the Jedi (1983) 586\n", "Star Wars: Episode V - The Empire Strikes Back ... 579\n", "Matrix, The (1999) 567\n", "Star Wars: Episode IV - A New Hope (1977) 562\n", "Braveheart (1995) 544\n", "Saving Private Ryan (1998) 543\n", "Jurassic Park (1993) 541\n", "Terminator 2: Judgment Day (1991) 529\n", "Men in Black (1997) 514" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
title
American Beauty (1999)715
Star Wars: Episode VI - Return of the Jedi (1983)586
Star Wars: Episode V - The Empire Strikes Back (1980)579
Matrix, The (1999)567
Star Wars: Episode IV - A New Hope (1977)562
Braveheart (1995)544
Saving Private Ryan (1998)543
Jurassic Park (1993)541
Terminator 2: Judgment Day (1991)529
Men in Black (1997)514
\n", "
" ] }, "metadata": {}, "execution_count": 10 } ], "metadata": {} }, { "cell_type": "code", "execution_count": 11, "source": [
 "# Print movie / user sum\r\n",
 "n_movies = final_df['movieId'].nunique()\r\n",
 "n_users = final_df['userId'].nunique()\r\n",
 "\r\n",
 "print('Number of movies:', n_movies)\r\n",
 "print('Number of users:', n_users)"
 ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Number of movies: 3706\n", "Number of users: 6040\n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": 12, "source": [
 "# Implement SVD with Surprise, a Python recommendation framework\r\n",
 "\r\n",
 "from surprise import Reader, Dataset, SVD, SVDpp\r\n",
 "from surprise import accuracy\r\n",
 "\r\n",
 "# Fixed seed: the factor matrices are initialised randomly, so without it every\r\n",
 "# run of the notebook would produce different recommendations.\r\n",
 "SEED = 42\r\n",
 "\r\n",
 "reader = Reader(rating_scale=(1, 5))\r\n",
 "dataset = Dataset.load_from_df(final_df[['userId', 'movieId', 'rating']], reader=reader)\r\n",
 "\r\n",
 "svd = SVD(n_factors=50, random_state=SEED)\r\n",
 "svd_plusplus = SVDpp(n_factors=50, random_state=SEED)\r\n",
 "\r\n",
 "# train with SVD on all available ratings (no hold-out split)\r\n",
 "trainset = dataset.build_full_trainset()\r\n",
 "svd.fit(trainset)\r\n",
 "# train with SVD++, ATTENTION: this takes a long time\r\n",
 "# svd_plusplus.fit(trainset)\r\n"
 ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": {}, "execution_count": 12 } ], "metadata": {} }, { "cell_type": "code", "execution_count": 15, "source": [
 "# Show titles instead of ids: lookup table movieId -> title\r\n",
 "id_2_names = dict(zip(movies['movieId'], movies['title']))"
 ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 16, "source": [
 "# function for test set\r\n",
 "def Build_Anti_Testset4User(user_id):\r\n",
 "    \"\"\"Build the prediction candidates for a single user.\r\n",
 "\r\n",
 "    Returns a list of (raw user id, raw item id, fill) tuples, one for every\r\n",
 "    item of the notebook-level `trainset` that `user_id` has not rated.\r\n",
 "    `fill` is the global mean rating of the training set and stands in for\r\n",
 "    the unknown true rating.\r\n",
 "    \"\"\"\r\n",
 "    fill = trainset.global_mean\r\n",
 "    u = trainset.to_inner_uid(user_id)\r\n",
 "\r\n",
 "    # ur == users ratings: (inner item id, rating) pairs of this user\r\n",
 "    user_items = {item_inner_id for item_inner_id, _ in trainset.ur[u]}\r\n",
 "\r\n",
 "    return [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill)\r\n",
 "            for i in trainset.all_items() if i not in user_items]"
 ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 19, "source": [
 "# Implement Top-X Chart recommender\r\n",
 "def TopXRec_SVD(user_id, num_recommender=10, latest=False, pool_size=20):\r\n",
 "    \"\"\"Recommend movies the user has not rated yet, using the fitted `svd` model.\r\n",
 "\r\n",
 "    Parameters\r\n",
 "    ----------\r\n",
 "    user_id : raw user id as used in the ratings table\r\n",
 "    num_recommender : number of rows to return\r\n",
 "    latest : if true, order the candidate pool by release year (newest first)\r\n",
 "        instead of by estimated rating\r\n",
 "    pool_size : how many of the best-estimated movies form the candidate pool;\r\n",
 "        raised to `num_recommender` when that is larger, so a request for more\r\n",
 "        rows than the pool holds is no longer silently cut short\r\n",
 "\r\n",
 "    Returns\r\n",
 "    -------\r\n",
 "    DataFrame with the columns 'title' and 'rating' (the estimated rating).\r\n",
 "    The index is the candidate's position in the rating-ordered pool.\r\n",
 "    \"\"\"\r\n",
 "    testSet = Build_Anti_Testset4User(user_id)\r\n",
 "    predict = svd.test(testSet) # here you can change to SVD++\r\n",
 "\r\n",
 "    # Best estimated rating first, then keep the candidate pool\r\n",
 "    predict.sort(key=lambda p: p.est, reverse=True)\r\n",
 "    top = predict[:max(pool_size, num_recommender)]\r\n",
 "\r\n",
 "    # Explicit dtypes keep the merge key compatible even when `top` is empty\r\n",
 "    candidates = pd.DataFrame({\r\n",
 "        'movieId': pd.Series([p.iid for p in top], dtype='int64'),\r\n",
 "        'rating': pd.Series([p.est for p in top], dtype='float64'),\r\n",
 "    })\r\n",
 "    # Join on the id rather than the title so that every candidate gets exactly\r\n",
 "    # one row, even if two movies were to share a title\r\n",
 "    movie_dataframe = candidates.merge(movies[['movieId', 'title', 'release_year']],\r\n",
 "                                       on='movieId', how='left')\r\n",
 "\r\n",
 "    if latest:\r\n",
 "        movie_dataframe = movie_dataframe.sort_values('release_year', ascending=False)\r\n",
 "\r\n",
 "    return movie_dataframe[['title', 'rating']].head(num_recommender)"
 ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 37, "source": [
 "# Only the last expression of a cell is displayed automatically, so the first\r\n",
 "# result is shown explicitly instead of being computed and thrown away.\r\n",
 "display(TopXRec_SVD(363, num_recommender=10))\r\n",
 "TopXRec_SVD(363, num_recommender=10, latest=True)"
 ], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title rating\n", "11 Wallace & Gromit: The Best of Aardman Animatio... 
4.705187\n", "2 Close Shave, A (1995) 4.832506\n", "12 Shawshank Redemption, The (1994) 4.700431\n", "1 Wrong Trousers, The (1993) 4.869839\n", "19 Grand Day Out, A (1992) 4.672615\n", "4 Blade Runner (1982) 4.819672\n", "15 Apocalypse Now (1979) 4.683492\n", "7 One Flew Over the Cuckoo's Nest (1975) 4.738641\n", "16 Young Frankenstein (1974) 4.682844\n", "10 Monty Python and the Holy Grail (1974) 4.705298" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlerating
11Wallace & Gromit: The Best of Aardman Animatio...4.705187
2Close Shave, A (1995)4.832506
12Shawshank Redemption, The (1994)4.700431
1Wrong Trousers, The (1993)4.869839
19Grand Day Out, A (1992)4.672615
4Blade Runner (1982)4.819672
15Apocalypse Now (1979)4.683492
7One Flew Over the Cuckoo's Nest (1975)4.738641
16Young Frankenstein (1974)4.682844
10Monty Python and the Holy Grail (1974)4.705298
\n", "
" ] }, "metadata": {}, "execution_count": 37 } ], "metadata": {} }, { "cell_type": "code", "execution_count": 36, "source": [], "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " title rating\n", "11 Wallace & Gromit: The Best of Aardman Animatio... 4.705187\n", "2 Close Shave, A (1995) 4.832506\n", "12 Shawshank Redemption, The (1994) 4.700431\n", "1 Wrong Trousers, The (1993) 4.869839\n", "19 Grand Day Out, A (1992) 4.672615\n", "4 Blade Runner (1982) 4.819672\n", "15 Apocalypse Now (1979) 4.683492\n", "7 One Flew Over the Cuckoo's Nest (1975) 4.738641\n", "16 Young Frankenstein (1974) 4.682844\n", "10 Monty Python and the Holy Grail (1974) 4.705298" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
titlerating
11Wallace & Gromit: The Best of Aardman Animatio...4.705187
2Close Shave, A (1995)4.832506
12Shawshank Redemption, The (1994)4.700431
1Wrong Trousers, The (1993)4.869839
19Grand Day Out, A (1992)4.672615
4Blade Runner (1982)4.819672
15Apocalypse Now (1979)4.683492
7One Flew Over the Cuckoo's Nest (1975)4.738641
16Young Frankenstein (1974)4.682844
10Monty Python and the Holy Grail (1974)4.705298
\n", "
" ] }, "metadata": {}, "execution_count": 36 } ], "metadata": {} }, { "cell_type": "code", "execution_count": 43, "source": [ "# Evaluation\r\n", "\r\n", "# Than predict ratings for all pairs not in training set.\r\n", "testset = trainset.build_anti_testset()\r\n", "predictions_svd = svd.test(testset)\r\n", "print('SVD - RMSE:', accuracy.rmse(predictions_svd, verbose=False))\r\n", "print('SVD - MAE:', accuracy.mae(predictions_svd, verbose=False))" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "SVD - RMSE: 0.7015239638211899\n", "SVD - MAE: 0.5429390320069348\n" ] } ], "metadata": {} } ], "metadata": { "orig_nbformat": 4, "language_info": { "name": "python", "version": "3.8.8", "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, "pygments_lexer": "ipython3", "nbconvert_exporter": "python", "file_extension": ".py" }, "kernelspec": { "name": "python3", "display_name": "Python 3.8.8 64-bit" }, "interpreter": { "hash": "53e4db133e7a886bd36ef8c79c0b5519f0af174d53fdba9ad5d5d94e6d9f4b55" } }, "nbformat": 4, "nbformat_minor": 2 }