From 610c14a5620699db812711e7e5a5132b386fee83 Mon Sep 17 00:00:00 2001 From: Oli Date: Mon, 30 Aug 2021 17:27:44 +0200 Subject: [PATCH] load dataset and extract attributes --- test.ipynb | 204 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 204 insertions(+) create mode 100644 test.ipynb diff --git a/test.ipynb b/test.ipynb new file mode 100644 index 0000000..1653575 --- /dev/null +++ b/test.ipynb @@ -0,0 +1,204 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 14, + "source": [ + "# This Notebook is created with VS Code on Windows\r\n", + "# Create python virtual environment\r\n", + "#!python -m venv .venv\r\n", + "# If you want to use it on macOS/Linux\r\n", + "# You may need to run sudo apt-get install python3-venv first\r\n", + "#python3 -m venv .venv\r\n", + "\r\n", + "# Install Python Packages\r\n", + "!pip install --user --upgrade pip\r\n", + "!pip install --user seaborn\r\n", + "!pip install --user numpy\r\n", + "!pip install --user pandas\r\n", + "!pip install --user matplotlib\r\n", + "!pip install --user plotly\r\n", + "!pip install --user nbformat\r\n" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Requirement already satisfied: pip in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (21.2.4)\n", + "Requirement already satisfied: seaborn in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (0.11.2)\n", + "Requirement already satisfied: matplotlib>=2.2 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (3.3.4)\n", + "Requirement already satisfied: numpy>=1.15 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (1.20.1)\n", + "Requirement already satisfied: scipy>=1.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (1.6.1)\n", + "Requirement already satisfied: pandas>=0.23 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (1.2.2)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n", + "Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (2.8.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (1.3.1)\n", + "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (8.1.0)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from matplotlib>=2.2->seaborn) (2.4.6)\n", + "Requirement already satisfied: six in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.15.0)\n", + "Requirement already satisfied: pytz>=2017.3 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas>=0.23->seaborn) (2021.1)\n", + "Requirement already satisfied: numpy in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (1.20.1)\n", + "Requirement already satisfied: pandas in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (1.2.2)\n", + "Requirement already satisfied: numpy>=1.16.5 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas) (1.20.1)\n", + "Requirement already satisfied: pytz>=2017.3 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas) (2021.1)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas) (2.8.1)\n", + "Requirement already satisfied: six>=1.5 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\n", + "Requirement already satisfied: matplotlib in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (3.3.4)\n", + "Requirement already satisfied: numpy>=1.15 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (1.20.1)\n", + "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from matplotlib) (2.4.6)\n", + "Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (2.8.1)\n", + "Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (1.3.1)\n", + "Requirement already satisfied: pillow>=6.2.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (8.1.0)\n", + "Requirement already satisfied: cycler>=0.10 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (0.10.0)\n", + "Requirement already satisfied: six in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from cycler>=0.10->matplotlib) (1.15.0)\n", + "Requirement already satisfied: plotly in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (5.3.0)\n", + "Requirement already satisfied: six in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from plotly) (1.15.0)\n", + "Requirement already satisfied: tenacity>=6.2.0 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from plotly) (8.0.1)\n", + "Collecting nbformat\n", + " Downloading nbformat-5.1.3-py3-none-any.whl (178 kB)\n", + "Requirement already satisfied: ipython-genutils in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from nbformat) (0.2.0)\n", + "Requirement already satisfied: traitlets>=4.1 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from nbformat) (5.0.5)\n", + "Collecting jsonschema!=2.5.0,>=2.4\n", + " Downloading jsonschema-3.2.0-py2.py3-none-any.whl (56 kB)\n", + "Requirement already satisfied: jupyter-core in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from nbformat) (4.7.1)\n", + "Requirement already satisfied: six>=1.11.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat) (1.15.0)\n", + "Requirement already satisfied: attrs>=17.4.0 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat) (19.3.0)\n", + "Requirement already satisfied: setuptools in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat) (49.2.1)\n", + "Collecting pyrsistent>=0.14.0\n", + " Downloading pyrsistent-0.18.0-cp38-cp38-win_amd64.whl (62 kB)\n", + "Requirement already satisfied: pywin32>=1.0 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from jupyter-core->nbformat) (300)\n", + "Installing collected packages: pyrsistent, jsonschema, nbformat\n", + "Successfully installed jsonschema-3.2.0 nbformat-5.1.3 pyrsistent-0.18.0\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + " WARNING: The script jsonschema.exe is installed in 'C:\\Users\\Oli\\AppData\\Roaming\\Python\\Python38\\Scripts' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n", + " WARNING: The script jupyter-trust.exe is installed in 'C:\\Users\\Oli\\AppData\\Roaming\\Python\\Python38\\Scripts' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 8, + "source": [ + "import numpy as np # linear algebra\r\n", + "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\r\n", + "import matplotlib.pyplot as plt\r\n", + "import seaborn as sns\r\n", + "import os\r\n", + "import re\r\n", + "\r\n", + "from plotly.offline import init_notebook_mode, iplot\r\n", + "import plotly.graph_objs as go\r\n", + "import plotly.offline as py\r\n", + "py.init_notebook_mode(connected=True)\r\n", + "\r\n", + "import warnings\r\n", + "warnings.filterwarnings('ignore')\r\n", + "\r\n", + "plt.style.use('fivethirtyeight')\r\n", + "plt.rcParams['figure.figsize'] = [18, 8]" + ], + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + " \n", + " " + ] + }, + "metadata": {} + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 23, + "source": [ + "# Import Tables\r\n", + "reviews = pd.read_csv('./ml-latest-small/ratings.csv', names=['userId', 'movieId', 'rating', 'timestamp'], delimiter=',', engine='python')\r\n", + "movies = pd.read_csv('./ml-latest-small/movies.csv', names=['movieId', 'title', 'genres'], delimiter=',', engine='python')\r\n", + "users = pd.read_csv('./ml-latest-small/users.csv', names=['userId', 'gender', 'age', 'occupation', 'zip'], delimiter='::', engine='python')\r\n", + "\r\n", + "# Print Table shape\r\n", + "print('Reviews shape:', reviews.shape)\r\n", + "print('Users shape:', users.shape)\r\n", + "print('Movies shape:', movies.shape)" + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Reviews shape: (100836, 4)\n", + "Users shape: (610, 5)\n", + "Movies shape: (9742, 3)\n" + ] + } + ], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 25, + "source": [ + "# Drop unused Attributes\r\n", + "reviews.drop(['timestamp'], axis=1, inplace=True) # Time\r\n", + "users.drop(['zip'], axis=1, inplace=True) # Zip-Code\r\n", + "\r\n", + "# Extract the movie year from title to extra attrbute\r\n", + "movies['release_year'] = movies['title'].str.extract(r'(?:\\((\\d{4})\\))?\\s*$', expand=False)" + ], + "outputs": [], + "metadata": {} + } + ], + "metadata": { + "orig_nbformat": 4, + "language_info": { + "name": "python", + "version": "3.8.8", + "mimetype": "text/x-python", + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "pygments_lexer": "ipython3", + "nbconvert_exporter": "python", + "file_extension": ".py" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.8.8 64-bit" + }, + "interpreter": { + "hash": "53e4db133e7a886bd36ef8c79c0b5519f0af174d53fdba9ad5d5d94e6d9f4b55" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} \ No newline at end of file