{ "cells": [ { "cell_type": "code", "execution_count": 14, "source": [ "# This Notebook is created with VS Code on Windows\r\n", "# Create python virtual environment\r\n", "#!python -m venv .venv\r\n", "# If you want to use it on macOS/Linux\r\n", "# You may need to run sudo apt-get install python3-venv first\r\n", "#python3 -m venv .venv\r\n", "\r\n", "# Install Python Packages\r\n", "!pip install --user --upgrade pip\r\n", "!pip install --user seaborn\r\n", "!pip install --user numpy\r\n", "!pip install --user pandas\r\n", "!pip install --user matplotlib\r\n", "!pip install --user plotly\r\n", "!pip install --user nbformat\r\n" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: pip in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (21.2.4)\n", "Requirement already satisfied: seaborn in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (0.11.2)\n", "Requirement already satisfied: matplotlib>=2.2 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (3.3.4)\n", "Requirement already satisfied: numpy>=1.15 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (1.20.1)\n", "Requirement already satisfied: scipy>=1.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (1.6.1)\n", "Requirement already satisfied: pandas>=0.23 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (1.2.2)\n", "Requirement already satisfied: cycler>=0.10 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n", "Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (2.8.1)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in 
c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (1.3.1)\n", "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (8.1.0)\n", "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from matplotlib>=2.2->seaborn) (2.4.6)\n", "Requirement already satisfied: six in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.15.0)\n", "Requirement already satisfied: pytz>=2017.3 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas>=0.23->seaborn) (2021.1)\n", "Requirement already satisfied: numpy in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (1.20.1)\n", "Requirement already satisfied: pandas in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (1.2.2)\n", "Requirement already satisfied: numpy>=1.16.5 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas) (1.20.1)\n", "Requirement already satisfied: pytz>=2017.3 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas) (2021.1)\n", "Requirement already satisfied: python-dateutil>=2.7.3 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas) (2.8.1)\n", "Requirement already satisfied: six>=1.5 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\n", "Requirement already satisfied: matplotlib in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (3.3.4)\n", "Requirement already satisfied: numpy>=1.15 in 
c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (1.20.1)\n", "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from matplotlib) (2.4.6)\n", "Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (2.8.1)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (1.3.1)\n", "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (8.1.0)\n", "Requirement already satisfied: cycler>=0.10 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (0.10.0)\n", "Requirement already satisfied: six in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from cycler>=0.10->matplotlib) (1.15.0)\n", "Requirement already satisfied: plotly in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (5.3.0)\n", "Requirement already satisfied: six in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from plotly) (1.15.0)\n", "Requirement already satisfied: tenacity>=6.2.0 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from plotly) (8.0.1)\n", "Collecting nbformat\n", " Downloading nbformat-5.1.3-py3-none-any.whl (178 kB)\n", "Requirement already satisfied: ipython-genutils in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from nbformat) (0.2.0)\n", "Requirement already satisfied: traitlets>=4.1 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from nbformat) (5.0.5)\n", "Collecting jsonschema!=2.5.0,>=2.4\n", " Downloading jsonschema-3.2.0-py2.py3-none-any.whl (56 kB)\n", "Requirement already 
satisfied: jupyter-core in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from nbformat) (4.7.1)\n", "Requirement already satisfied: six>=1.11.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat) (1.15.0)\n", "Requirement already satisfied: attrs>=17.4.0 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat) (19.3.0)\n", "Requirement already satisfied: setuptools in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat) (49.2.1)\n", "Collecting pyrsistent>=0.14.0\n", " Downloading pyrsistent-0.18.0-cp38-cp38-win_amd64.whl (62 kB)\n", "Requirement already satisfied: pywin32>=1.0 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from jupyter-core->nbformat) (300)\n", "Installing collected packages: pyrsistent, jsonschema, nbformat\n", "Successfully installed jsonschema-3.2.0 nbformat-5.1.3 pyrsistent-0.18.0\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ " WARNING: The script jsonschema.exe is installed in 'C:\\Users\\Oli\\AppData\\Roaming\\Python\\Python38\\Scripts' which is not on PATH.\n", " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n", " WARNING: The script jupyter-trust.exe is installed in 'C:\\Users\\Oli\\AppData\\Roaming\\Python\\Python38\\Scripts' which is not on PATH.\n", " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": 8, "source": [ "import numpy as np # linear algebra\r\n", "import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\r\n", "import matplotlib.pyplot as plt\r\n", "import seaborn as sns\r\n", "import os\r\n", "import re\r\n", "\r\n", "from plotly.offline import init_notebook_mode, iplot\r\n", "import plotly.graph_objs as go\r\n", "import plotly.offline as py\r\n", "py.init_notebook_mode(connected=True)\r\n", "\r\n", "import warnings\r\n", "warnings.filterwarnings('ignore')\r\n", "\r\n", "plt.style.use('fivethirtyeight')\r\n", "plt.rcParams['figure.figsize'] = [18, 8]" ], "outputs": [ { "output_type": "display_data", "data": { "text/html": [ " \n", " " ] }, "metadata": {} } ], "metadata": {} }, { "cell_type": "code", "execution_count": 23, "source": [ "# Import Tables\r\n", "reviews = pd.read_csv('./ml-latest-small/ratings.csv', names=['userId', 'movieId', 'rating', 'timestamp'], delimiter=',', engine='python')\r\n", "movies = pd.read_csv('./ml-latest-small/movies.csv', names=['movieId', 'title', 'genres'], delimiter=',', engine='python')\r\n", "users = pd.read_csv('./ml-latest-small/users.csv', names=['userId', 'gender', 'age', 'occupation', 'zip'], delimiter='::', engine='python')\r\n", "\r\n", "# Print Table shape\r\n", "print('Reviews shape:', reviews.shape)\r\n", "print('Users shape:', users.shape)\r\n", "print('Movies shape:', movies.shape)" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Reviews shape: (100836, 4)\n", "Users shape: (610, 5)\n", "Movies shape: (9742, 3)\n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": 25, "source": [ "# Drop unused Attributes\r\n", "reviews.drop(['timestamp'], axis=1, inplace=True) # Time\r\n", "users.drop(['zip'], axis=1, inplace=True) # Zip-Code\r\n", "\r\n", "# Extract the release year from the title into a separate attribute\r\n", "movies['release_year'] = movies['title'].str.extract(r'(?:\\((\\d{4})\\))?\\s*$', expand=False)" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 26, "source": [ "# print movie table\r\n", "movies.head()" ], "outputs": [ { "output_type": 
"execute_result", "data": { "text/plain": [ " movieId title \\\n", "0 1 Toy Story (1995) \n", "1 2 Jumanji (1995) \n", "2 3 Grumpier Old Men (1995) \n", "3 4 Waiting to Exhale (1995) \n", "4 5 Father of the Bride Part II (1995) \n", "\n", " genres release_year \n", "0 Adventure|Animation|Children|Comedy|Fantasy 1995 \n", "1 Adventure|Children|Fantasy 1995 \n", "2 Comedy|Romance 1995 \n", "3 Comedy|Drama|Romance 1995 \n", "4 Comedy 1995 " ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
movieIdtitlegenresrelease_year
01Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy1995
12Jumanji (1995)Adventure|Children|Fantasy1995
23Grumpier Old Men (1995)Comedy|Romance1995
34Waiting to Exhale (1995)Comedy|Drama|Romance1995
45Father of the Bride Part II (1995)Comedy1995
\n", "
" ] }, "metadata": {}, "execution_count": 26 } ], "metadata": {} }, { "cell_type": "code", "execution_count": 27, "source": [ "# changed feature values based on README_users.txt\r\n", "\r\n", "ages_map = {1: 'Under 18',\r\n", " 18: '18 - 24',\r\n", " 25: '25 - 34',\r\n", " 35: '35 - 44',\r\n", " 45: '45 - 49',\r\n", " 50: '50 - 55',\r\n", " 56: '56+'}\r\n", "\r\n", "occupations_map = {0: 'Not specified',\r\n", " 1: 'Academic / Educator',\r\n", " 2: 'Artist',\r\n", " 3: 'Clerical / Admin',\r\n", " 4: 'College / Grad Student',\r\n", " 5: 'Customer Service',\r\n", " 6: 'Doctor / Health Care',\r\n", " 7: 'Executive / Managerial',\r\n", " 8: 'Farmer',\r\n", " 9: 'Homemaker',\r\n", " 10: 'K-12 student',\r\n", " 11: 'Lawyer',\r\n", " 12: 'Programmer',\r\n", " 13: 'Retired',\r\n", " 14: 'Sales / Marketing',\r\n", " 15: 'Scientist',\r\n", " 16: 'Self-Employed',\r\n", " 17: 'Technician / Engineer',\r\n", " 18: 'Tradesman / Craftsman',\r\n", " 19: 'Unemployed',\r\n", " 20: 'Writer'}\r\n", "\r\n", "gender_map = {'M': 'Male', 'F': 'Female'}\r\n", "\r\n", "users['age'] = users['age'].map(ages_map)\r\n", "users['occupation'] = users['occupation'].map(occupations_map)\r\n", "users['gender'] = users['gender'].map(gender_map)" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 28, "source": [ "age_reindex = ['Under 18', '18 - 24', '25 - 34', '35 - 44', '45 - 49', '50 - 55', '56+']\r\n", "\r\n", "age_counts = users['age'].value_counts().reindex(age_reindex)\r\n", "\r\n", "sns.barplot(x=age_counts.values,\r\n", " y=age_counts.index,\r\n", " palette='magma').set_title(\r\n", " 'Users age', fontsize=24)\r\n", "\r\n", "plt.show()" ], "outputs": [ { "output_type": "display_data", "data": { "text/plain": [ "
" ], "image/svg+xml": "\r\n\r\n\r\n\r\n \r\n \r\n \r\n \r\n 2021-08-30T17:25:58.608318\r\n image/svg+xml\r\n \r\n \r\n Matplotlib v3.3.4, https://matplotlib.org/\r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n \r\n\r\n", "image/png": "" }, "metadata": 
{} } ], "metadata": {} } ], "metadata": { "orig_nbformat": 4, "language_info": { "name": "python", "version": "3.8.8", "mimetype": "text/x-python", "codemirror_mode": { "name": "ipython", "version": 3 }, "pygments_lexer": "ipython3", "nbconvert_exporter": "python", "file_extension": ".py" }, "kernelspec": { "name": "python3", "display_name": "Python 3.8.8 64-bit" }, "interpreter": { "hash": "53e4db133e7a886bd36ef8c79c0b5519f0af174d53fdba9ad5d5d94e6d9f4b55" } }, "nbformat": 4, "nbformat_minor": 2 }