load dataset and extract attributes
This commit is contained in:
204
test.ipynb
Normal file
204
test.ipynb
Normal file
@@ -0,0 +1,204 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"source": [
|
||||||
|
"# This notebook was created with VS Code on Windows\r\n",
|
||||||
|
"# Create python virtual environment\r\n",
|
||||||
|
"#!python -m venv .venv\r\n",
|
||||||
|
"# On macOS/Linux, create the virtual environment with python3 instead (see below)\r\n",
|
||||||
|
"# On Debian/Ubuntu you may first need to run: sudo apt-get install python3-venv\r\n",
|
||||||
|
"#python3 -m venv .venv\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"# Install Python Packages\r\n",
|
||||||
|
"!pip install --user --upgrade pip\r\n",
|
||||||
|
"!pip install --user seaborn\r\n",
|
||||||
|
"!pip install --user numpy\r\n",
|
||||||
|
"!pip install --user pandas\r\n",
|
||||||
|
"!pip install --user matplotlib\r\n",
|
||||||
|
"!pip install --user plotly\r\n",
|
||||||
|
"!pip install --user nbformat\r\n"
|
||||||
|
],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"Requirement already satisfied: pip in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (21.2.4)\n",
|
||||||
|
"Requirement already satisfied: seaborn in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (0.11.2)\n",
|
||||||
|
"Requirement already satisfied: matplotlib>=2.2 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (3.3.4)\n",
|
||||||
|
"Requirement already satisfied: numpy>=1.15 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (1.20.1)\n",
|
||||||
|
"Requirement already satisfied: scipy>=1.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (1.6.1)\n",
|
||||||
|
"Requirement already satisfied: pandas>=0.23 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from seaborn) (1.2.2)\n",
|
||||||
|
"Requirement already satisfied: cycler>=0.10 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (0.10.0)\n",
|
||||||
|
"Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (2.8.1)\n",
|
||||||
|
"Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (1.3.1)\n",
|
||||||
|
"Requirement already satisfied: pillow>=6.2.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib>=2.2->seaborn) (8.1.0)\n",
|
||||||
|
"Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from matplotlib>=2.2->seaborn) (2.4.6)\n",
|
||||||
|
"Requirement already satisfied: six in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from cycler>=0.10->matplotlib>=2.2->seaborn) (1.15.0)\n",
|
||||||
|
"Requirement already satisfied: pytz>=2017.3 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas>=0.23->seaborn) (2021.1)\n",
|
||||||
|
"Requirement already satisfied: numpy in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (1.20.1)\n",
|
||||||
|
"Requirement already satisfied: pandas in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (1.2.2)\n",
|
||||||
|
"Requirement already satisfied: numpy>=1.16.5 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas) (1.20.1)\n",
|
||||||
|
"Requirement already satisfied: pytz>=2017.3 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas) (2021.1)\n",
|
||||||
|
"Requirement already satisfied: python-dateutil>=2.7.3 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from pandas) (2.8.1)\n",
|
||||||
|
"Requirement already satisfied: six>=1.5 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from python-dateutil>=2.7.3->pandas) (1.15.0)\n",
|
||||||
|
"Requirement already satisfied: matplotlib in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (3.3.4)\n",
|
||||||
|
"Requirement already satisfied: numpy>=1.15 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (1.20.1)\n",
|
||||||
|
"Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from matplotlib) (2.4.6)\n",
|
||||||
|
"Requirement already satisfied: python-dateutil>=2.1 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (2.8.1)\n",
|
||||||
|
"Requirement already satisfied: kiwisolver>=1.0.1 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (1.3.1)\n",
|
||||||
|
"Requirement already satisfied: pillow>=6.2.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (8.1.0)\n",
|
||||||
|
"Requirement already satisfied: cycler>=0.10 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from matplotlib) (0.10.0)\n",
|
||||||
|
"Requirement already satisfied: six in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from cycler>=0.10->matplotlib) (1.15.0)\n",
|
||||||
|
"Requirement already satisfied: plotly in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (5.3.0)\n",
|
||||||
|
"Requirement already satisfied: six in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from plotly) (1.15.0)\n",
|
||||||
|
"Requirement already satisfied: tenacity>=6.2.0 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from plotly) (8.0.1)\n",
|
||||||
|
"Collecting nbformat\n",
|
||||||
|
" Downloading nbformat-5.1.3-py3-none-any.whl (178 kB)\n",
|
||||||
|
"Requirement already satisfied: ipython-genutils in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from nbformat) (0.2.0)\n",
|
||||||
|
"Requirement already satisfied: traitlets>=4.1 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from nbformat) (5.0.5)\n",
|
||||||
|
"Collecting jsonschema!=2.5.0,>=2.4\n",
|
||||||
|
" Downloading jsonschema-3.2.0-py2.py3-none-any.whl (56 kB)\n",
|
||||||
|
"Requirement already satisfied: jupyter-core in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from nbformat) (4.7.1)\n",
|
||||||
|
"Requirement already satisfied: six>=1.11.0 in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat) (1.15.0)\n",
|
||||||
|
"Requirement already satisfied: attrs>=17.4.0 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat) (19.3.0)\n",
|
||||||
|
"Requirement already satisfied: setuptools in c:\\users\\oli\\appdata\\local\\programs\\python\\python38\\lib\\site-packages (from jsonschema!=2.5.0,>=2.4->nbformat) (49.2.1)\n",
|
||||||
|
"Collecting pyrsistent>=0.14.0\n",
|
||||||
|
" Downloading pyrsistent-0.18.0-cp38-cp38-win_amd64.whl (62 kB)\n",
|
||||||
|
"Requirement already satisfied: pywin32>=1.0 in c:\\users\\oli\\appdata\\roaming\\python\\python38\\site-packages (from jupyter-core->nbformat) (300)\n",
|
||||||
|
"Installing collected packages: pyrsistent, jsonschema, nbformat\n",
|
||||||
|
"Successfully installed jsonschema-3.2.0 nbformat-5.1.3 pyrsistent-0.18.0\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stderr",
|
||||||
|
"text": [
|
||||||
|
" WARNING: The script jsonschema.exe is installed in 'C:\\Users\\Oli\\AppData\\Roaming\\Python\\Python38\\Scripts' which is not on PATH.\n",
|
||||||
|
" Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n",
|
||||||
|
" WARNING: The script jupyter-trust.exe is installed in 'C:\\Users\\Oli\\AppData\\Roaming\\Python\\Python38\\Scripts' which is not on PATH.\n",
|
||||||
|
" Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"source": [
|
||||||
|
"import numpy as np # linear algebra\r\n",
|
||||||
|
"import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\r\n",
|
||||||
|
"import matplotlib.pyplot as plt\r\n",
|
||||||
|
"import seaborn as sns\r\n",
|
||||||
|
"import os\r\n",
|
||||||
|
"import re\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"from plotly.offline import init_notebook_mode, iplot\r\n",
|
||||||
|
"import plotly.graph_objs as go\r\n",
|
||||||
|
"import plotly.offline as py\r\n",
|
||||||
|
"py.init_notebook_mode(connected=True)\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"import warnings\r\n",
|
||||||
|
"warnings.filterwarnings('ignore')\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"plt.style.use('fivethirtyeight')\r\n",
|
||||||
|
"plt.rcParams['figure.figsize'] = [18, 8]"
|
||||||
|
],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "display_data",
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
" <script type=\"text/javascript\">\n",
|
||||||
|
" window.PlotlyConfig = {MathJaxConfig: 'local'};\n",
|
||||||
|
" if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
|
||||||
|
" if (typeof require !== 'undefined') {\n",
|
||||||
|
" require.undef(\"plotly\");\n",
|
||||||
|
" requirejs.config({\n",
|
||||||
|
" paths: {\n",
|
||||||
|
" 'plotly': ['https://cdn.plot.ly/plotly-2.4.1.min']\n",
|
||||||
|
" }\n",
|
||||||
|
" });\n",
|
||||||
|
" require(['plotly'], function(Plotly) {\n",
|
||||||
|
" window._Plotly = Plotly;\n",
|
||||||
|
" });\n",
|
||||||
|
" }\n",
|
||||||
|
" </script>\n",
|
||||||
|
" "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"metadata": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 23,
|
||||||
|
"source": [
|
||||||
|
"# Import Tables\r\n",
|
||||||
|
"reviews = pd.read_csv('./ml-latest-small/ratings.csv', names=['userId', 'movieId', 'rating', 'timestamp'], delimiter=',', engine='python')\r\n",
|
||||||
|
"movies = pd.read_csv('./ml-latest-small/movies.csv', names=['movieId', 'title', 'genres'], delimiter=',', engine='python')\r\n",
|
||||||
|
"users = pd.read_csv('./ml-latest-small/users.csv', names=['userId', 'gender', 'age', 'occupation', 'zip'], delimiter='::', engine='python')\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"# Print Table shape\r\n",
|
||||||
|
"print('Reviews shape:', reviews.shape)\r\n",
|
||||||
|
"print('Users shape:', users.shape)\r\n",
|
||||||
|
"print('Movies shape:', movies.shape)"
|
||||||
|
],
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"output_type": "stream",
|
||||||
|
"name": "stdout",
|
||||||
|
"text": [
|
||||||
|
"Reviews shape: (100836, 4)\n",
|
||||||
|
"Users shape: (610, 5)\n",
|
||||||
|
"Movies shape: (9742, 3)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 25,
|
||||||
|
"source": [
|
||||||
|
"# Drop unused Attributes\r\n",
|
||||||
|
"reviews.drop(['timestamp'], axis=1, inplace=True) # Time\r\n",
|
||||||
|
"users.drop(['zip'], axis=1, inplace=True) # Zip-Code\r\n",
|
||||||
|
"\r\n",
|
||||||
|
"# Extract the release year from the movie title into a separate attribute\r\n",
|
||||||
|
"movies['release_year'] = movies['title'].str.extract(r'(?:\\((\\d{4})\\))?\\s*$', expand=False)"
|
||||||
|
],
|
||||||
|
"outputs": [],
|
||||||
|
"metadata": {}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"orig_nbformat": 4,
|
||||||
|
"language_info": {
|
||||||
|
"name": "python",
|
||||||
|
"version": "3.8.8",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"file_extension": ".py"
|
||||||
|
},
|
||||||
|
"kernelspec": {
|
||||||
|
"name": "python3",
|
||||||
|
"display_name": "Python 3.8.8 64-bit"
|
||||||
|
},
|
||||||
|
"interpreter": {
|
||||||
|
"hash": "53e4db133e7a886bd36ef8c79c0b5519f0af174d53fdba9ad5d5d94e6d9f4b55"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user