{ "cells": [ { "cell_type": "code", "execution_count": 14, "source": [ "# This notebook was created with VS Code on Windows\r\n", "# Create python virtual environment\r\n", "#!python -m venv .venv\r\n", "# If you want to use it on macOS/Linux\r\n", "# You may need to run sudo apt-get install python3-venv first\r\n", "#python3 -m venv .venv\r\n", "\r\n", "# Install Python packages. The %pip magic (instead of !pip) runs pip for the interpreter\r\n", "# of the running kernel, so the packages land where the imports below will look for them.\r\n", "%pip install --user --upgrade pip\r\n", "%pip install --user seaborn\r\n", "%pip install --user numpy\r\n", "%pip install --user pandas\r\n", "%pip install --user matplotlib\r\n", "%pip install --user plotly\r\n", "%pip install --user nbformat\r\n" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: pip in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (21.2.4)\n", "Requirement already satisfied: seaborn in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (0.11.2)\n", "Requirement already satisfied: matplotlib>=2.2 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (3.3.4)\n", "Requirement already satisfied: numpy>=1.15 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (1.20.1)\n", "Requirement already satisfied: scipy>=1.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (1.6.1)\n", "Requirement already satisfied: pandas>=0.23 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (1.2.2)\n", "Requirement already satisfied: cycler>=0.10 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n", "Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (2.8.1)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in 
c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (1.3.1)\n", "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (8.1.0)\n", "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from matplotlib>=2.2->seaborn) (2.4.6)\n", "Requirement already satisfied: six in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.15.0)\n", "Requirement already satisfied: pytz>=2017.3 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas>=0.23->seaborn) (2021.1)\n", "Requirement already satisfied: numpy in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (1.20.1)\n", "Requirement already satisfied: pandas in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (1.2.2)\n", "Requirement already satisfied: numpy>=1.16.5 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas) (1.20.1)\n", "Requirement already satisfied: pytz>=2017.3 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas) (2021.1)\n", "Requirement already satisfied: python-dateutil>=2.7.3 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas) (2.8.1)\n", "Requirement already satisfied: six>=1.5 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\n", "Requirement already satisfied: matplotlib in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (3.3.4)\n", "Requirement already satisfied: numpy>=1.15 in 
c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (1.20.1)\n", "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from matplotlib) (2.4.6)\n", "Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (2.8.1)\n", "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (1.3.1)\n", "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (8.1.0)\n", "Requirement already satisfied: cycler>=0.10 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (0.10.0)\n", "Requirement already satisfied: six in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from cycler>=0.10->matplotlib) (1.15.0)\n", "Requirement already satisfied: plotly in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (5.3.0)\n", "Requirement already satisfied: six in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from plotly) (1.15.0)\n", "Requirement already satisfied: tenacity>=6.2.0 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from plotly) (8.0.1)\n", "Collecting nbformat\n", " Downloading nbformat-5.1.3-py3-none-any.whl (178 kB)\n", "Requirement already satisfied: ipython-genutils in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from nbformat) (0.2.0)\n", "Requirement already satisfied: traitlets>=4.1 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from nbformat) (5.0.5)\n", "Collecting jsonschema!=2.5.0,>=2.4\n", " Downloading jsonschema-3.2.0-py2.py3-none-any.whl (56 kB)\n", "Requirement already 
satisfied: jupyter-core in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from nbformat) (4.7.1)\n", "Requirement already satisfied: six>=1.11.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat) (1.15.0)\n", "Requirement already satisfied: attrs>=17.4.0 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat) (19.3.0)\n", "Requirement already satisfied: setuptools in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat) (49.2.1)\n", "Collecting pyrsistent>=0.14.0\n", " Downloading pyrsistent-0.18.0-cp38-cp38-win_amd64.whl (62 kB)\n", "Requirement already satisfied: pywin32>=1.0 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from jupyter-core->nbformat) (300)\n", "Installing collected packages: pyrsistent, jsonschema, nbformat\n", "Successfully installed jsonschema-3.2.0 nbformat-5.1.3 pyrsistent-0.18.0\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ " WARNING: The script jsonschema.exe is installed in 'C:\\Users\\Oli\\AppData\\Roaming\\Python\\Python38\\Scripts' which is not on PATH.\n", " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n", " WARNING: The script jupyter-trust.exe is installed in 'C:\\Users\\Oli\\AppData\\Roaming\\Python\\Python38\\Scripts' which is not on PATH.\n", " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": 8, "source": [ "import numpy as np # linear algebra\r\n", "import pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\r\n", "import matplotlib.pyplot as plt\r\n", "import seaborn as sns\r\n", "import os\r\n", "import re\r\n", "\r\n", "from plotly.offline import init_notebook_mode, iplot\r\n", "import plotly.graph_objs as go\r\n", "import plotly.offline as py\r\n", "py.init_notebook_mode(connected=True)\r\n", "\r\n", "import warnings\r\n", "warnings.filterwarnings('ignore')\r\n", "\r\n", "plt.style.use('fivethirtyeight')\r\n", "plt.rcParams['figure.figsize'] = [18, 8]" ], "outputs": [ { "output_type": "display_data", "data": { "text/html": [ " \n", " " ] }, "metadata": {} } ], "metadata": {} }, { "cell_type": "code", "execution_count": 23, "source": [ "# Import tables (column names are passed explicitly via names=)\r\n", "reviews = pd.read_csv('./ml-latest-small/ratings.csv', names=['userId', 'movieId', 'rating', 'timestamp'], delimiter=',', engine='python')\r\n", "movies = pd.read_csv('./ml-latest-small/movies.csv', names=['movieId', 'title', 'genres'], delimiter=',', engine='python')\r\n", "users = pd.read_csv('./ml-latest-small/users.csv', names=['userId', 'gender', 'age', 'occupation', 'zip'], delimiter='::', engine='python')\r\n", "\r\n", "# Print Table shape\r\n", "print('Reviews shape:', reviews.shape)\r\n", "print('Users shape:', users.shape)\r\n", "print('Movies shape:', movies.shape)" ], "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Reviews shape: (100836, 4)\n", "Users shape: (610, 5)\n", "Movies shape: (9742, 3)\n" ] } ], "metadata": {} }, { "cell_type": "code", "execution_count": 25, "source": [ "# Drop unused attributes. Reassigning with errors='ignore' (instead of inplace=True)\r\n", "# keeps this cell re-runnable: a second run no longer raises KeyError.\r\n", "reviews = reviews.drop(columns=['timestamp'], errors='ignore') # Time\r\n", "users = users.drop(columns=['zip'], errors='ignore') # Zip-Code\r\n", "\r\n", "# Extract the release year from a trailing '(YYYY)' in the title into an extra attribute;\r\n", "# the year group is optional, so titles without one yield NaN.\r\n", "movies['release_year'] = movies['title'].str.extract(r'(?:\\((\\d{4})\\))?\\s*$', expand=False)" ], "outputs": [], "metadata": {} }, { "cell_type": "code", "execution_count": 26, "source": [ "# Show the first rows of the movie table\r\n", "movies.head()" ], "outputs": [ { "output_type": 
"execute_result", "data": { "text/plain": [ " movieId title \\\n", "0 1 Toy Story (1995) \n", "1 2 Jumanji (1995) \n", "2 3 Grumpier Old Men (1995) \n", "3 4 Waiting to Exhale (1995) \n", "4 5 Father of the Bride Part II (1995) \n", "\n", " genres release_year \n", "0 Adventure|Animation|Children|Comedy|Fantasy 1995 \n", "1 Adventure|Children|Fantasy 1995 \n", "2 Comedy|Romance 1995 \n", "3 Comedy|Drama|Romance 1995 \n", "4 Comedy 1995 " ], "text/html": [ "
| \n", " | movieId | \n", "title | \n", "genres | \n", "release_year | \n", "
|---|---|---|---|---|
| 0 | \n", "1 | \n", "Toy Story (1995) | \n", "Adventure|Animation|Children|Comedy|Fantasy | \n", "1995 | \n", "
| 1 | \n", "2 | \n", "Jumanji (1995) | \n", "Adventure|Children|Fantasy | \n", "1995 | \n", "
| 2 | \n", "3 | \n", "Grumpier Old Men (1995) | \n", "Comedy|Romance | \n", "1995 | \n", "
| 3 | \n", "4 | \n", "Waiting to Exhale (1995) | \n", "Comedy|Drama|Romance | \n", "1995 | \n", "
| 4 | \n", "5 | \n", "Father of the Bride Part II (1995) | \n", "Comedy | \n", "1995 | \n", "