# This Notebook was exported from VS Code Jupyter

# %%
from IPython import get_ipython

# %%
# This Notebook was created with VS Code on Windows

# Create a Python virtual environment
get_ipython().system('python -m venv .venv')
# On macOS/Linux, create the environment with:
# python3 -m venv .venv
# (on Debian/Ubuntu you may need to run `sudo apt-get install python3-venv` first)

# Install Python packages
get_ipython().system('pip install --user --upgrade pip')
get_ipython().system('pip install --upgrade setuptools')
get_ipython().system('pip install --user seaborn')
get_ipython().system('pip install --user numpy')
get_ipython().system('pip install --user pandas')
get_ipython().system('pip install --user matplotlib')
get_ipython().system('pip install --user plotly')
get_ipython().system('pip install --user nbformat')
get_ipython().system('pip install --user surprise')
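
# %%
# Note (added sketch, not part of the original notebook): `pip install --user`
# runs whichever pip is on PATH, which is not necessarily the .venv created
# above nor the environment of the running kernel. In IPython >= 7.3 the %pip
# magic installs into the active kernel's environment instead. Package names
# mirror the cell above; Surprise is published on PyPI as scikit-surprise.
get_ipython().run_line_magic('pip', 'install seaborn numpy pandas matplotlib plotly nbformat scikit-surprise')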

# %%
import numpy as np              # maths
import pandas as pd             # data processing
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)

import warnings
warnings.filterwarnings('ignore')

plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = [18, 8]
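
# %%
# Added sketch (not in the original notebook): fetch the MovieLens 1M data if
# ./ml-1m is missing. The GroupLens download URL below is an assumption and
# may change; skip this cell if the folder is already in place.
import urllib.request, zipfile

if not os.path.isdir('./ml-1m'):
    urllib.request.urlretrieve('https://files.grouplens.org/datasets/movielens/ml-1m.zip', 'ml-1m.zip')
    with zipfile.ZipFile('ml-1m.zip') as zf:
        zf.extractall('.')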

# %%
# Import tables (the ml-1m .dat files contain ISO-8859-1 encoded characters)
reviews = pd.read_csv('./ml-1m/ratings.dat', names=['userId', 'movieId', 'rating', 'timestamp'], delimiter='::', engine='python', encoding='latin-1')
movies = pd.read_csv('./ml-1m/movies.dat', names=['movieId', 'title', 'genres'], delimiter='::', engine='python', encoding='latin-1')
users = pd.read_csv('./ml-1m/users.dat', names=['userId', 'gender', 'age', 'occupation', 'zip'], delimiter='::', engine='python', encoding='latin-1')

# Print table shapes
print('Reviews shape:', reviews.shape)
print('Users shape:', users.shape)
print('Movies shape:', movies.shape)

# %%
# Drop unused attributes
reviews.drop(['timestamp'], axis=1, inplace=True)  # time
users.drop(['zip'], axis=1, inplace=True)          # zip code

# Extract the release year from the title into a separate attribute
movies['release_year'] = movies['title'].str.extract(r'(?:\((\d{4})\))?\s*$', expand=False)
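
# %%
# Added sanity check (sketch): titles without a trailing "(YYYY)" end up as NaN
# after the extraction above.
print('Titles without a parsable year:', movies['release_year'].isna().sum())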

# %%
# Print movie table
movies.head()

# %%
# Map coded feature values to labels based on README_users.txt
ages_map = {1: 'Under 18',
            18: '18 - 24',
            25: '25 - 34',
            35: '35 - 44',
            45: '45 - 49',
            50: '50 - 55',
            56: '56+'}

occupations_map = {0: 'Not specified',
                   1: 'Academic / Educator',
                   2: 'Artist',
                   3: 'Clerical / Admin',
                   4: 'College / Grad Student',
                   5: 'Customer Service',
                   6: 'Doctor / Health Care',
                   7: 'Executive / Managerial',
                   8: 'Farmer',
                   9: 'Homemaker',
                   10: 'K-12 student',
                   11: 'Lawyer',
                   12: 'Programmer',
                   13: 'Retired',
                   14: 'Sales / Marketing',
                   15: 'Scientist',
                   16: 'Self-Employed',
                   17: 'Technician / Engineer',
                   18: 'Tradesman / Craftsman',
                   19: 'Unemployed',
                   20: 'Writer'}

gender_map = {'M': 'Male', 'F': 'Female'}

users['age'] = users['age'].map(ages_map)
users['occupation'] = users['occupation'].map(occupations_map)
users['gender'] = users['gender'].map(gender_map)
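
# %%
# Added sanity check (sketch): .map() silently turns any unmapped code into NaN,
# so verify the three columns were fully covered by the dictionaries above.
print(users[['age', 'occupation', 'gender']].isna().sum())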

# %%
# Plot age categories
age_reindex = ['Under 18', '18 - 24', '25 - 34', '35 - 44', '45 - 49', '50 - 55', '56+']
age_counts = users['age'].value_counts().reindex(age_reindex)

sns.barplot(x=age_counts.values,
            y=age_counts.index,
            palette='magma').set_title('Users age', fontsize=12)
plt.show()

# %%
# Plot gender of users
gender_counts = users['gender'].value_counts()
colors1 = ['lightblue', 'pink']

pie = go.Pie(labels=gender_counts.index,
             values=gender_counts.values,
             marker=dict(colors=colors1),
             hole=0.5)
layout = go.Layout(title='Users gender', font=dict(size=12), legend=dict(orientation='h'))

fig = go.Figure(data=[pie], layout=layout)
py.iplot(fig)

# %%
# Merge the reviews, movies and users tables
final_df = reviews.merge(movies, on='movieId', how='left').merge(users, on='userId', how='left')
print('final_df shape:', final_df.shape)
final_df.head()

# %%
# Ten most-rated titles among users aged 18 - 24
final_df[final_df['age'] == '18 - 24']['title'].value_counts()[:10].to_frame()
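
# %%
# Added sketch generalising the cell above: top-N most-rated titles for any of
# the mapped age groups.
def top_titles_by_age(age_group, n=10):
    return final_df.loc[final_df['age'] == age_group, 'title'].value_counts().head(n).to_frame()

top_titles_by_age('25 - 34')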

# %%
# Print number of distinct movies and users
n_movies = final_df['movieId'].nunique()
n_users = final_df['userId'].nunique()

print('Number of movies:', n_movies)
print('Number of users:', n_users)
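
# %%
# Added sketch: sparsity of the user-item rating matrix, derived from the
# counts above and the number of ratings in final_df.
sparsity = 1 - len(final_df) / (n_users * n_movies)
print(f'Rating matrix sparsity: {sparsity:.2%}')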

# %%
# Implement SVD with Surprise, a Python recommendation framework
from surprise import Reader, Dataset, SVD, SVDpp
from surprise import accuracy

reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(final_df[['userId', 'movieId', 'rating']], reader=reader)

svd = SVD(n_factors=50)
svd_plusplus = SVDpp(n_factors=50)

# Train with SVD
trainset = dataset.build_full_trainset()
svd.fit(trainset)
# Train with SVD++ -- ATTENTION: this takes a long time
# svd_plusplus.fit(trainset)
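
# %%
# Added sketch: a cross-validated error estimate with Surprise's built-in
# helper, as a complement to the in-sample evaluation at the end of this
# notebook. cross_validate refits on each fold, so it gets a fresh SVD instance.
from surprise.model_selection import cross_validate

cv_results = cross_validate(SVD(n_factors=50), dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)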

# %%
# Show titles instead of ids
id_2_names = dict()
for idx, name in zip(movies['movieId'], movies['title']):
    id_2_names[idx] = name

# %%
# Build the "anti testset" for one user: every movie the user has NOT rated,
# with the global mean as placeholder rating.
def Build_Anti_Testset4User(user_id):

    fill = trainset.global_mean
    anti_testset = list()
    u = trainset.to_inner_uid(user_id)

    # ur == user ratings: (inner item id, rating) pairs
    user_items = set([item_inner_id for (item_inner_id, rating) in trainset.ur[u]])

    anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill)
                     for i in trainset.all_items() if i not in user_items]

    return anti_testset
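
# %%
# Added usage sketch for the helper above: number of unseen movies for user 363
# (the example user used further down).
print('Unrated movies for user 363:', len(Build_Anti_Testset4User(363)))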

# %%
# Implement a Top-X chart recommender
def TopXRec_SVD(user_id, num_recommender=10, latest=False):

    testSet = Build_Anti_Testset4User(user_id)
    predict = svd.test(testSet)  # here you can switch to SVD++ (svd_plusplus)

    recommendation = list()
    for userID, movieID, actualRating, estimatedRating, _ in predict:
        recommendation.append((int(movieID), estimatedRating))

    # Sort by predicted rating and keep only the 20 best candidates
    recommendation.sort(key=lambda x: x[1], reverse=True)

    movie_names = []
    movie_ratings = []
    for movie_id, rating in recommendation[:20]:
        movie_names.append(id_2_names[movie_id])
        movie_ratings.append(rating)

    movie_dataframe = pd.DataFrame({'title': movie_names,
                                    'rating': movie_ratings}).merge(movies[['title', 'release_year']],
                                                                    on='title', how='left')

    if latest:
        # Re-rank the 20 candidates by release year instead of predicted rating
        return movie_dataframe.sort_values('release_year', ascending=False)[['title', 'rating']].head(num_recommender)
    else:
        return movie_dataframe.drop('release_year', axis=1).head(num_recommender)

# %%
# Run the recommender (one call per cell, so each result frame is displayed)
TopXRec_SVD(363, num_recommender=10)

# %%
TopXRec_SVD(363, num_recommender=10, latest=True)

# %%
# Evaluation
# Note: build_anti_testset() fills the "true" rating of every unseen pair with
# the global mean, so the RMSE/MAE below measure deviation from that fill value
# rather than accuracy on held-out ratings (see the cross-validation cell above
# for a held-out estimate).
testset = trainset.build_anti_testset()
predictions_svd = svd.test(testset)
print('SVD - RMSE:', accuracy.rmse(predictions_svd, verbose=False))
print('SVD - MAE:', accuracy.mae(predictions_svd, verbose=False))
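
# %%
# Added sketch: score a single (user, movie) pair directly with the trained
# model; 363 and 1193 are arbitrary example ids from the 1M data.
print(svd.predict(uid=363, iid=1193))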