final version for academic assignment

export for academic assignment
2021-08-31 22:07:33 +02:00 · 2021-08-31 22:06:54 +02:00
2 changed files with 600 additions and 156 deletions
--- a/export.py
+++ b/export.py
@@ -0,0 +1,242 @@
+# This Notebook was exported from VS Code Jupyter
+# %%
+from IPython import get_ipython
+
+# %%
+# Environment setup cell. This notebook was created with VS Code on Windows.
+# Create a Python virtual environment in ./.venv.
+# NOTE(review): nothing in this cell activates .venv, so the installs below
+# presumably target whichever interpreter runs the notebook -- confirm.
+get_ipython().system('python -m venv .venv')
+# On macOS/Linux use the command below instead; you may need to run
+# `sudo apt-get install python3-venv` first.
+#python3 -m venv .venv
+
+# Install the Python packages imported by the later cells
+# (pip and setuptools are upgraded first).
+get_ipython().system('pip install --user --upgrade pip')
+get_ipython().system('pip install --upgrade setuptools')
+get_ipython().system('pip install --user seaborn')
+get_ipython().system('pip install --user numpy')
+get_ipython().system('pip install --user pandas')
+get_ipython().system('pip install --user matplotlib')
+get_ipython().system('pip install --user plotly')
+get_ipython().system('pip install --user nbformat')
+get_ipython().system('pip install --user surprise')
+
+
+# %%
+import numpy as np # maths
+import pandas as pd # data processing
+import matplotlib.pyplot as plt
+import seaborn as sns
+import os
+import re
+
+from plotly.offline import init_notebook_mode, iplot
+import plotly.graph_objs as go
+import plotly.offline as py
+# Initialise Plotly's offline notebook mode before py.iplot() is called
+# further down.
+py.init_notebook_mode(connected=True)
+
+import warnings
+# Silence every warning for the rest of the session.
+# NOTE(review): this also hides deprecation warnings raised by the libraries.
+warnings.filterwarnings('ignore')
+
+# Global Matplotlib defaults: 'fivethirtyeight' style sheet and an
+# 18 x 8 figure size for all following plots.
+plt.style.use('fivethirtyeight')
+plt.rcParams['figure.figsize'] = [18, 8]
+
+
+# %%
+# Import the three MovieLens 1M tables.
+# The .dat files have no header row and use '::' as field separator; a
+# separator longer than one character needs the Python parser engine.
+# The encoding is pinned to latin-1 so that movie titles with accented
+# characters load identically on every platform instead of depending on the
+# interpreter's default encoding.
+ML_1M_READ_OPTIONS = dict(sep='::', engine='python', header=None, encoding='latin-1')
+
+reviews = pd.read_csv('./ml-1m/ratings.dat',
+                      names=['userId', 'movieId', 'rating', 'timestamp'],
+                      **ML_1M_READ_OPTIONS)
+movies = pd.read_csv('./ml-1m/movies.dat',
+                     names=['movieId', 'title', 'genres'],
+                     **ML_1M_READ_OPTIONS)
+users = pd.read_csv('./ml-1m/users.dat',
+                    names=['userId', 'gender', 'age', 'occupation', 'zip'],
+                    **ML_1M_READ_OPTIONS)
+
+# Print the (rows, columns) shape of every table as a quick sanity check
+print('Reviews shape:', reviews.shape)
+print('Users shape:', users.shape)
+print('Movies shape:', movies.shape)
+
+
+# %%
+# Remove attributes that are not needed
+reviews.drop(columns=['timestamp'], inplace=True)  # rating time
+users.drop(columns=['zip'], inplace=True)  # zip code
+
+# Copy the four-digit year found in trailing parentheses of the title into
+# its own attribute
+title_year = movies['title'].str.extract(r'(?:\((\d{4})\))?\s*$', expand=False)
+movies['release_year'] = title_year
+
+
+# %%
+# Preview the first rows of the movie table
+movies.head()
+
+
+# %%
+# Replace the coded user attributes with readable labels
+# (codes taken from README_users.txt)
+ages_map = dict(zip(
+    (1, 18, 25, 35, 45, 50, 56),
+    ('Under 18', '18 - 24', '25 - 34', '35 - 44', '45 - 49', '50 - 55', '56+'),
+))
+
+# The occupation codes run from 0 to 20 without gaps, so the position in the
+# list is the code.
+occupations_map = dict(enumerate([
+    'Not specified',           # 0
+    'Academic / Educator',     # 1
+    'Artist',                  # 2
+    'Clerical / Admin',        # 3
+    'College / Grad Student',  # 4
+    'Customer Service',        # 5
+    'Doctor / Health Care',    # 6
+    'Executive / Managerial',  # 7
+    'Farmer',                  # 8
+    'Homemaker',               # 9
+    'K-12 student',            # 10
+    'Lawyer',                  # 11
+    'Programmer',              # 12
+    'Retired',                 # 13
+    'Sales / Marketing',       # 14
+    'Scientist',               # 15
+    'Self-Employed',           # 16
+    'Technician / Engineer',   # 17
+    'Tradesman / Craftsman',   # 18
+    'Unemployed',              # 19
+    'Writer',                  # 20
+]))
+
+gender_map = {'M': 'Male', 'F': 'Female'}
+
+# Apply each map to its column of the users table
+for column, labels in (('age', ages_map),
+                       ('occupation', occupations_map),
+                       ('gender', gender_map)):
+    users[column] = users[column].map(labels)
+
+
+# %%
+# Plot age categories
+
+# Reindex the counts so the bars follow the order of the age brackets
+# listed here rather than the order returned by value_counts().
+age_reindex = ['Under 18', '18 - 24', '25 - 34', '35 - 44', '45 - 49', '50 - 55', '56+']
+age_counts = users['age'].value_counts().reindex(age_reindex)
+
+age_axes = sns.barplot(x=age_counts.values, y=age_counts.index, palette='magma')
+age_axes.set_title('Users age', fontsize=12)
+
+plt.show()
+
+
+# %%
+# Plot the gender split of the users as a pie chart with a hole in the middle
+colors1 = ['lightblue', 'pink']
+gender_counts = users['gender'].value_counts()
+
+pie = go.Pie(
+    labels=gender_counts.index,
+    values=gender_counts.values,
+    marker={'colors': colors1},
+    hole=0.5,
+)
+layout = go.Layout(
+    title='Gender Users',
+    font={'size': 12},
+    legend={'orientation': 'h'},
+)
+
+fig = go.Figure(data=[pie], layout=layout)
+py.iplot(fig)
+
+
+# %%
+# Attach the movie and user attributes to every review; the left joins keep
+# all rows of the reviews table
+reviews_with_movies = pd.merge(reviews, movies, on='movieId', how='left')
+final_df = pd.merge(reviews_with_movies, users, on='userId', how='left')
+print('final_df shape:', final_df.shape)
+final_df.head()
+
+
+# %%
+# Ten most frequently reviewed titles among the users aged 18 - 24
+is_18_to_24 = final_df['age'] == '18 - 24'
+final_df.loc[is_18_to_24, 'title'].value_counts().head(10).to_frame()
+
+
+# %%
+# Count the distinct movies and users in the merged table
+n_users = final_df['userId'].nunique()
+n_movies = final_df['movieId'].nunique()
+
+print('Number of movies:', n_movies)
+print('Number of users:', n_users)
+
+
+# %%
+# Implement SVD with Python Surprise, a Python recommendation framework
+
+from surprise import Reader, Dataset, SVD, SVDpp
+from surprise import accuracy
+
+# Hand the (user, movie, rating) columns to Surprise with a 1 to 5 rating scale
+reader = Reader(rating_scale=(1, 5))
+rating_columns = final_df[['userId', 'movieId', 'rating']]
+dataset = Dataset.load_from_df(rating_columns, reader=reader)
+
+# Build the training set from the complete dataset
+trainset = dataset.build_full_trainset()
+
+# Train with SVD
+svd = SVD(n_factors=50)
+svd.fit(trainset)
+
+# SVD++ is only instantiated here. ATTENTION: fitting it takes a LONG TIME,
+# so the call stays disabled.
+svd_plusplus = SVDpp(n_factors=50)
+# svd_plusplus.fit(trainset)
+
+
+# %%
+# Lookup table movieId -> title, used to show titles instead of ids
+id_2_names = dict(zip(movies['movieId'], movies['title']))
+
+
+# %%
+# function for test set
+def Build_Anti_Testset4User(user_id):
+    """Build prediction candidates for every movie the user has not rated.
+
+    Args:
+        user_id: Raw user id; it is translated with ``trainset.to_inner_uid``.
+
+    Returns:
+        List of ``(raw user id, raw item id, global mean rating)`` tuples,
+        one for each item of the global ``trainset`` that has no rating
+        from this user.
+    """
+    inner_uid = trainset.to_inner_uid(user_id)
+    raw_uid = trainset.to_raw_uid(inner_uid)
+    placeholder_rating = trainset.global_mean
+
+    # trainset.ur[inner_uid] holds the user's (inner item id, rating) pairs
+    rated_items = {inner_iid for inner_iid, _ in trainset.ur[inner_uid]}
+
+    return [
+        (raw_uid, trainset.to_raw_iid(inner_iid), placeholder_rating)
+        for inner_iid in trainset.all_items()
+        if inner_iid not in rated_items
+    ]
+
+
+# %%
+# Implement Top-X Chart recommender
+def TopXRec_SVD(user_id, num_recommender=10, latest=False, pool_size=20):
+    """Return the best-estimated movies the user has not rated yet.
+
+    Args:
+        user_id: Raw user id as used in the ratings table.
+        num_recommender: Number of rows to return.
+        latest: If true, order the candidate pool by release year (newest
+            first) before it is cut down to ``num_recommender`` rows.
+        pool_size: Number of best-estimated candidates that are considered.
+            Defaults to 20, the value that used to be hard-coded.
+
+    Returns:
+        DataFrame with the columns ``title`` and ``rating`` (the estimated
+        rating).
+    """
+    testSet = Build_Anti_Testset4User(user_id)
+    predict = svd.test(testSet)  # here you can change to SVD++
+
+    # Keep only (movie id, estimated rating) and rank by estimate, best first
+    recommendation = sorted(
+        ((int(movieID), estimatedRating)
+         for _, movieID, _, estimatedRating, _ in predict),
+        key=lambda pair: pair[1],
+        reverse=True,
+    )
+
+    # The pool must never be smaller than the requested amount; otherwise
+    # num_recommender values above pool_size would be silently truncated.
+    pool = recommendation[:max(pool_size, num_recommender)]
+
+    # Look the release year up by movie id. Joining on the title could
+    # duplicate rows if two movies shared a title.
+    year_by_id = dict(zip(movies['movieId'], movies['release_year']))
+    movie_dataframe = pd.DataFrame({
+        'title': [id_2_names[movie_id] for movie_id, _ in pool],
+        'rating': [estimate for _, estimate in pool],
+        'release_year': [year_by_id[movie_id] for movie_id, _ in pool],
+    })
+
+    if latest:
+        # mergesort is stable: movies of the same year keep their rating order
+        movie_dataframe = movie_dataframe.sort_values(
+            'release_year', ascending=False, kind='mergesort')
+
+    return movie_dataframe[['title', 'rating']].head(num_recommender)
+
+
+# %%
+# Run Recommender
+# print() is needed: the value of a bare expression in the middle of a cell
+# is discarded.
+print(TopXRec_SVD(363, num_recommender=10))
+print(TopXRec_SVD(363, num_recommender=10, latest=True))
+
+# Evaluation
+# An anti-testset only contains (user, movie) pairs without a real rating;
+# their "true" value is the global-mean filler, so an error computed on it
+# says nothing about accuracy. Measure RMSE/MAE on held-out real ratings
+# instead, using a model that was not trained on them.
+from surprise.model_selection import train_test_split
+
+eval_trainset, testset = train_test_split(dataset, test_size=0.25, random_state=42)
+eval_svd = SVD(n_factors=50)  # same configuration as the recommender model
+eval_svd.fit(eval_trainset)
+predictions_svd = eval_svd.test(testset)
+print('SVD - RMSE:', accuracy.rmse(predictions_svd, verbose=False))
+print('SVD - MAE:', accuracy.mae(predictions_svd, verbose=False))
--- a/test.ipynb
+++ b/test.ipynb
Author	SHA1	Message	Date
Oli	acb1109781	final version for academic assignment	2021-08-31 22:07:33 +02:00
Oli	bedfef2605	export for academic assignment	2021-08-31 22:06:54 +02:00