# Recommender_System/export.py
# This Notebook was exported from VS Code Jupyter
# %%
from IPython import get_ipython
# %%
# This notebook was created with VS Code on Windows
# Create a Python virtual environment
get_ipython().system('python -m venv .venv')
# On macOS/Linux use instead:
#   python3 -m venv .venv
# (on Debian/Ubuntu you may first need to run: sudo apt-get install python3-venv)
# Install Python Packages
get_ipython().system('pip install --user --upgrade pip')
get_ipython().system('pip install --upgrade setuptools')
get_ipython().system('pip install --user seaborn')
get_ipython().system('pip install --user numpy')
get_ipython().system('pip install --user pandas')
get_ipython().system('pip install --user matplotlib')
get_ipython().system('pip install --user plotly')
get_ipython().system('pip install --user nbformat')
# The Surprise library is published on PyPI as scikit-surprise
get_ipython().system('pip install --user scikit-surprise')
# %%
import numpy as np # maths
import pandas as pd # data processing
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)
import warnings
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = [18, 8]
# %%
# Import tables (the ml-1m .dat files are latin-1 encoded, not UTF-8)
reviews = pd.read_csv('./ml-1m/ratings.dat', names=['userId', 'movieId', 'rating', 'timestamp'], delimiter='::', engine='python', encoding='latin-1')
movies = pd.read_csv('./ml-1m/movies.dat', names=['movieId', 'title', 'genres'], delimiter='::', engine='python', encoding='latin-1')
users = pd.read_csv('./ml-1m/users.dat', names=['userId', 'gender', 'age', 'occupation', 'zip'], delimiter='::', engine='python', encoding='latin-1')
# Print Table shape
print('Reviews shape:', reviews.shape)
print('Users shape:', users.shape)
print('Movies shape:', movies.shape)
# %%
# Drop unused Attributes
reviews.drop(['timestamp'], axis=1, inplace=True) # Time
users.drop(['zip'], axis=1, inplace=True) # Zip-Code
# Extract the release year from the title into a separate attribute
movies['release_year'] = movies['title'].str.extract(r'(?:\((\d{4})\))?\s*$', expand=False)
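# %%
# Added sanity check (not in the original notebook): titles from which no
# 4-digit year could be extracted end up as NaN in release_year.
print('Titles without a release year:', movies['release_year'].isna().sum())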
# %%
# Print movie table
movies.head()
# %%
# Map coded feature values to readable labels, based on README_users.txt
ages_map = {1: 'Under 18',
            18: '18 - 24',
            25: '25 - 34',
            35: '35 - 44',
            45: '45 - 49',
            50: '50 - 55',
            56: '56+'}
occupations_map = {0: 'Not specified',
                   1: 'Academic / Educator',
                   2: 'Artist',
                   3: 'Clerical / Admin',
                   4: 'College / Grad Student',
                   5: 'Customer Service',
                   6: 'Doctor / Health Care',
                   7: 'Executive / Managerial',
                   8: 'Farmer',
                   9: 'Homemaker',
                   10: 'K-12 student',
                   11: 'Lawyer',
                   12: 'Programmer',
                   13: 'Retired',
                   14: 'Sales / Marketing',
                   15: 'Scientist',
                   16: 'Self-Employed',
                   17: 'Technician / Engineer',
                   18: 'Tradesman / Craftsman',
                   19: 'Unemployed',
                   20: 'Writer'}
gender_map = {'M': 'Male', 'F': 'Female'}
users['age'] = users['age'].map(ages_map)
users['occupation'] = users['occupation'].map(occupations_map)
users['gender'] = users['gender'].map(gender_map)
# %%
# Plot age categories
age_reindex = ['Under 18', '18 - 24', '25 - 34', '35 - 44', '45 - 49', '50 - 55', '56+']
age_counts = users['age'].value_counts().reindex(age_reindex)
sns.barplot(x=age_counts.values,
            y=age_counts.index,
            palette='magma').set_title('Users age', fontsize=12)
plt.show()
# %%
# Plot gender of users
gender_counts = users['gender'].value_counts()
colors1 = ['lightblue', 'pink']
pie = go.Pie(labels=gender_counts.index,
             values=gender_counts.values,
             marker=dict(colors=colors1),
             hole=0.5)
layout = go.Layout(title='Users gender', font=dict(size=12), legend=dict(orientation='h'))
fig = go.Figure(data=[pie], layout=layout)
py.iplot(fig)
# %%
# Merge the reviews, movies and users tables
final_df = reviews.merge(movies, on='movieId', how='left').merge(users, on='userId', how='left')
print('final_df shape:', final_df.shape)
final_df.head()
# %%
# Ten most-rated titles among users aged 18 - 24
final_df[final_df['age'] == '18 - 24']['title'].value_counts()[:10].to_frame()
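# %%
# Added variation (not in the original notebook): the same slice-and-count idea
# applied to another demographic attribute from the occupation mapping above.
final_df[final_df['occupation'] == 'Programmer']['title'].value_counts()[:10].to_frame()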
# %%
# Print the number of distinct movies and users
n_movies = final_df['movieId'].nunique()
n_users = final_df['userId'].nunique()
print('Number of movies:', n_movies)
print('Number of users:', n_users)
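# %%
# Added illustration (not in the original notebook): density of the user-item
# matrix, i.e. the share of all possible (user, movie) pairs that have a rating.
density = len(final_df) / (n_users * n_movies)
print('Rating matrix density: {:.2%}'.format(density))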
# %%
# Implement SVD with Surprise, a Python recommendation framework
from surprise import Reader, Dataset, SVD, SVDpp
from surprise import accuracy
reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(final_df[['userId', 'movieId', 'rating']], reader=reader)
svd = SVD(n_factors=50)
svd_plusplus = SVDpp(n_factors=50)
# train with SVD
trainset = dataset.build_full_trainset()
svd.fit(trainset)
# Train with SVD++ (ATTENTION: this takes a long time)
# svd_plusplus.fit(trainset)
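# %%
# Added sketch (not in the original notebook): predict a single rating with the
# trained model. Surprise's predict() takes raw user and item ids; user 363 and
# movie 1 are example ids assumed to exist in ml-1m.
single_pred = svd.predict(uid=363, iid=1)
print('Predicted rating of user 363 for movie 1: {:.2f}'.format(single_pred.est))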
# %%
# Map movie IDs to titles, so recommendations can show titles instead of IDs
id_2_names = dict()
for idx, names in zip(movies['movieId'], movies['title']):
    id_2_names[idx] = names
# %%
# Build, for one user, an anti-testset: every movie the user has NOT rated,
# filled with the global mean as placeholder rating
def Build_Anti_Testset4User(user_id):
    fill = trainset.global_mean
    anti_testset = list()
    u = trainset.to_inner_uid(user_id)
    # trainset.ur == user ratings: (item_inner_id, rating) pairs per inner user id
    user_items = set([item_inner_id for (item_inner_id, rating) in trainset.ur[u]])
    anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill)
                     for i in trainset.all_items() if i not in user_items]
    return anti_testset
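# %%
# Added sanity check (not in the original notebook): the anti-testset for a user
# should contain one entry per movie that user has not rated.
anti = Build_Anti_Testset4User(363)
n_rated = len(trainset.ur[trainset.to_inner_uid(363)])
print(len(anti), '==', trainset.n_items - n_rated)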
# %%
# Implement Top-X chart recommender
def TopXRec_SVD(user_id, num_recommender=10, latest=False):
    testSet = Build_Anti_Testset4User(user_id)
    predict = svd.test(testSet)  # change svd to svd_plusplus here to use SVD++
    recommendation = list()
    for userID, movieID, actualRating, estimatedRating, _ in predict:
        intMovieID = int(movieID)
        recommendation.append((intMovieID, estimatedRating))
    # Sort by estimated rating, highest first
    recommendation.sort(key=lambda x: x[1], reverse=True)
    movie_names = []
    movie_ratings = []
    for movie_id, rating in recommendation[:20]:
        movie_names.append(id_2_names[movie_id])
        movie_ratings.append(rating)
    movie_dataframe = pd.DataFrame({'title': movie_names,
                                    'rating': movie_ratings}).merge(movies[['title', 'release_year']],
                                                                    on='title', how='left')
    if latest:
        return movie_dataframe.sort_values('release_year', ascending=False)[['title', 'rating']].head(num_recommender)
    else:
        return movie_dataframe.drop('release_year', axis=1).head(num_recommender)
# %%
# Run recommender (in a notebook, only the last expression of a cell is displayed)
TopXRec_SVD(363, num_recommender=10)
# %%
TopXRec_SVD(363, num_recommender=10, latest=True)
# %%
# Evaluation on the full anti-testset (all unrated pairs, filled with the global mean)
testset = trainset.build_anti_testset()
predictions_svd = svd.test(testset)
print('SVD - RMSE:', accuracy.rmse(predictions_svd, verbose=False))
print('SVD - MAE:', accuracy.mae(predictions_svd, verbose=False))
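# %%
# Added sketch (not in the original notebook): the RMSE/MAE above compare the
# predictions against the global-mean fill values of the anti-testset. A more
# conventional evaluation holds out part of the known ratings, e.g. with
# Surprise's train_test_split:
from surprise.model_selection import train_test_split
train_part, test_part = train_test_split(dataset, test_size=0.25, random_state=42)
svd_eval = SVD(n_factors=50)
svd_eval.fit(train_part)
held_out_predictions = svd_eval.test(test_part)
print('SVD (held-out) - RMSE:', accuracy.rmse(held_out_predictions, verbose=False))
print('SVD (held-out) - MAE:', accuracy.mae(held_out_predictions, verbose=False))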