# This Notebook was exported from VS Code Jupyter

# %%
from IPython import get_ipython

# %%
# This Notebook was created with VS Code on Windows

# Create a Python virtual environment
get_ipython().system('python -m venv .venv')
# On macOS/Linux, create the environment with:
# python3 -m venv .venv
# (on Debian/Ubuntu you may need to run `sudo apt-get install python3-venv` first)

# Install Python packages
get_ipython().system('pip install --user --upgrade pip')
get_ipython().system('pip install --upgrade setuptools')
get_ipython().system('pip install --user seaborn')
get_ipython().system('pip install --user numpy')
get_ipython().system('pip install --user pandas')
get_ipython().system('pip install --user matplotlib')
get_ipython().system('pip install --user plotly')
get_ipython().system('pip install --user nbformat')
get_ipython().system('pip install --user surprise')
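
# %%
# Note (added sketch, not part of the original notebook): `pip install --user`
# runs whichever pip is on PATH, which is not necessarily the .venv created
# above nor the environment of the running kernel. In IPython >= 7.3 the %pip
# magic installs into the active kernel's environment instead. Package names
# mirror the cell above; Surprise is published on PyPI as scikit-surprise.
get_ipython().run_line_magic('pip', 'install seaborn numpy pandas matplotlib plotly nbformat scikit-surprise')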

# %%
import numpy as np              # maths
import pandas as pd             # data processing
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)

import warnings
warnings.filterwarnings('ignore')

plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = [18, 8]
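
# %%
# Added sketch (not in the original notebook): fetch the MovieLens 1M data if
# ./ml-1m is missing. The GroupLens download URL below is an assumption and
# may change; skip this cell if the folder is already in place.
import urllib.request, zipfile

if not os.path.isdir('./ml-1m'):
    urllib.request.urlretrieve('https://files.grouplens.org/datasets/movielens/ml-1m.zip', 'ml-1m.zip')
    with zipfile.ZipFile('ml-1m.zip') as zf:
        zf.extractall('.')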

# %%
# Import tables (the ml-1m .dat files contain ISO-8859-1 encoded characters)
reviews = pd.read_csv('./ml-1m/ratings.dat', names=['userId', 'movieId', 'rating', 'timestamp'], delimiter='::', engine='python', encoding='latin-1')
movies = pd.read_csv('./ml-1m/movies.dat', names=['movieId', 'title', 'genres'], delimiter='::', engine='python', encoding='latin-1')
users = pd.read_csv('./ml-1m/users.dat', names=['userId', 'gender', 'age', 'occupation', 'zip'], delimiter='::', engine='python', encoding='latin-1')

# Print table shapes
print('Reviews shape:', reviews.shape)
print('Users shape:', users.shape)
print('Movies shape:', movies.shape)

# %%
# Drop unused attributes
reviews.drop(['timestamp'], axis=1, inplace=True)  # time
users.drop(['zip'], axis=1, inplace=True)          # zip code

# Extract the release year from the title into a separate attribute
movies['release_year'] = movies['title'].str.extract(r'(?:\((\d{4})\))?\s*$', expand=False)
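
# %%
# Added sanity check (sketch): titles without a trailing "(YYYY)" end up as NaN
# after the extraction above.
print('Titles without a parsable year:', movies['release_year'].isna().sum())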

# %%
# Print movie table
movies.head()

# %%
# Map coded feature values to labels based on README_users.txt
ages_map = {1: 'Under 18',
            18: '18 - 24',
            25: '25 - 34',
            35: '35 - 44',
            45: '45 - 49',
            50: '50 - 55',
            56: '56+'}

occupations_map = {0: 'Not specified',
                   1: 'Academic / Educator',
                   2: 'Artist',
                   3: 'Clerical / Admin',
                   4: 'College / Grad Student',
                   5: 'Customer Service',
                   6: 'Doctor / Health Care',
                   7: 'Executive / Managerial',
                   8: 'Farmer',
                   9: 'Homemaker',
                   10: 'K-12 student',
                   11: 'Lawyer',
                   12: 'Programmer',
                   13: 'Retired',
                   14: 'Sales / Marketing',
                   15: 'Scientist',
                   16: 'Self-Employed',
                   17: 'Technician / Engineer',
                   18: 'Tradesman / Craftsman',
                   19: 'Unemployed',
                   20: 'Writer'}

gender_map = {'M': 'Male', 'F': 'Female'}

users['age'] = users['age'].map(ages_map)
users['occupation'] = users['occupation'].map(occupations_map)
users['gender'] = users['gender'].map(gender_map)
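
# %%
# Added sanity check (sketch): .map() silently turns any unmapped code into NaN,
# so verify the three columns were fully covered by the dictionaries above.
print(users[['age', 'occupation', 'gender']].isna().sum())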

# %%
# Plot age categories
age_reindex = ['Under 18', '18 - 24', '25 - 34', '35 - 44', '45 - 49', '50 - 55', '56+']
age_counts = users['age'].value_counts().reindex(age_reindex)

sns.barplot(x=age_counts.values,
            y=age_counts.index,
            palette='magma').set_title('Users age', fontsize=12)
plt.show()

# %%
# Plot gender of users
gender_counts = users['gender'].value_counts()
colors1 = ['lightblue', 'pink']

pie = go.Pie(labels=gender_counts.index,
             values=gender_counts.values,
             marker=dict(colors=colors1),
             hole=0.5)
layout = go.Layout(title='Users gender', font=dict(size=12), legend=dict(orientation='h'))

fig = go.Figure(data=[pie], layout=layout)
py.iplot(fig)

# %%
# Merge the reviews, movies and users tables
final_df = reviews.merge(movies, on='movieId', how='left').merge(users, on='userId', how='left')
print('final_df shape:', final_df.shape)
final_df.head()

# %%
# Ten most-rated titles among users aged 18 - 24
final_df[final_df['age'] == '18 - 24']['title'].value_counts()[:10].to_frame()
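
# %%
# Added sketch generalising the cell above: top-N most-rated titles for any of
# the mapped age groups.
def top_titles_by_age(age_group, n=10):
    return final_df.loc[final_df['age'] == age_group, 'title'].value_counts().head(n).to_frame()

top_titles_by_age('25 - 34')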

# %%
# Print number of distinct movies and users
n_movies = final_df['movieId'].nunique()
n_users = final_df['userId'].nunique()

print('Number of movies:', n_movies)
print('Number of users:', n_users)
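
# %%
# Added sketch: sparsity of the user-item rating matrix, derived from the
# counts above and the number of ratings in final_df.
sparsity = 1 - len(final_df) / (n_users * n_movies)
print(f'Rating matrix sparsity: {sparsity:.2%}')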

# %%
# Implement SVD with Surprise, a Python recommendation framework
from surprise import Reader, Dataset, SVD, SVDpp
from surprise import accuracy

reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(final_df[['userId', 'movieId', 'rating']], reader=reader)

svd = SVD(n_factors=50)
svd_plusplus = SVDpp(n_factors=50)

# Train with SVD
trainset = dataset.build_full_trainset()
svd.fit(trainset)
# Train with SVD++ -- ATTENTION: this takes a long time
# svd_plusplus.fit(trainset)
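
# %%
# Added sketch: a cross-validated error estimate with Surprise's built-in
# helper, as a complement to the in-sample evaluation at the end of this
# notebook. cross_validate refits on each fold, so it gets a fresh SVD instance.
from surprise.model_selection import cross_validate

cv_results = cross_validate(SVD(n_factors=50), dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)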

# %%
# Show titles instead of ids
id_2_names = dict()
for idx, name in zip(movies['movieId'], movies['title']):
    id_2_names[idx] = name

# %%
# Build the "anti testset" for one user: every movie the user has NOT rated,
# with the global mean as placeholder rating.
def Build_Anti_Testset4User(user_id):

    fill = trainset.global_mean
    anti_testset = list()
    u = trainset.to_inner_uid(user_id)

    # ur == user ratings: (inner item id, rating) pairs
    user_items = set([item_inner_id for (item_inner_id, rating) in trainset.ur[u]])

    anti_testset += [(trainset.to_raw_uid(u), trainset.to_raw_iid(i), fill)
                     for i in trainset.all_items() if i not in user_items]

    return anti_testset
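
# %%
# Added usage sketch for the helper above: number of unseen movies for user 363
# (the example user used further down).
print('Unrated movies for user 363:', len(Build_Anti_Testset4User(363)))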

# %%
# Implement a Top-X chart recommender
def TopXRec_SVD(user_id, num_recommender=10, latest=False):

    testSet = Build_Anti_Testset4User(user_id)
    predict = svd.test(testSet)  # here you can switch to SVD++ (svd_plusplus)

    recommendation = list()
    for userID, movieID, actualRating, estimatedRating, _ in predict:
        recommendation.append((int(movieID), estimatedRating))

    # Sort by predicted rating and keep only the 20 best candidates
    recommendation.sort(key=lambda x: x[1], reverse=True)

    movie_names = []
    movie_ratings = []
    for movie_id, rating in recommendation[:20]:
        movie_names.append(id_2_names[movie_id])
        movie_ratings.append(rating)

    movie_dataframe = pd.DataFrame({'title': movie_names,
                                    'rating': movie_ratings}).merge(movies[['title', 'release_year']],
                                                                    on='title', how='left')

    if latest:
        # Re-rank the 20 candidates by release year instead of predicted rating
        return movie_dataframe.sort_values('release_year', ascending=False)[['title', 'rating']].head(num_recommender)
    else:
        return movie_dataframe.drop('release_year', axis=1).head(num_recommender)

# %%
# Run the recommender (one call per cell, so each result frame is displayed)
TopXRec_SVD(363, num_recommender=10)

# %%
TopXRec_SVD(363, num_recommender=10, latest=True)

# %%
# Evaluation
# Note: build_anti_testset() fills the "true" rating of every unseen pair with
# the global mean, so the RMSE/MAE below measure deviation from that fill value
# rather than accuracy on held-out ratings (see the cross-validation cell above
# for a held-out estimate).
testset = trainset.build_anti_testset()
predictions_svd = svd.test(testset)
print('SVD - RMSE:', accuracy.rmse(predictions_svd, verbose=False))
print('SVD - MAE:', accuracy.mae(predictions_svd, verbose=False))
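
# %%
# Added sketch: score a single (user, movie) pair directly with the trained
# model; 363 and 1193 are arbitrary example ids from the 1M data.
print(svd.predict(uid=363, iid=1193))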