# This Notebook was exported from VS Code Jupyter
# %%
from IPython import get_ipython

# %%
# This Notebook is created with VS Code on Windows
# Create python virtual environment
# NOTE(review): the environment is created but never activated by this
# notebook, and the installs below use `--user`; confirm this is intended.
get_ipython().system('python -m venv .venv')
# If you want to use it on macOS/Linux
# You may need to run sudo apt-get install python3-venv first
#python3 -m venv .venv

# Install Python Packages (same commands, same order as before)
for pip_args in (
    '--user --upgrade pip',
    '--upgrade setuptools',
    '--user seaborn',
    '--user numpy',
    '--user pandas',
    '--user matplotlib',
    '--user plotly',
    '--user nbformat',
    '--user surprise',
):
    get_ipython().system(f'pip install {pip_args}')


# %%
import numpy as np  # maths
import pandas as pd  # data processing
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)

import warnings
warnings.filterwarnings('ignore')

plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = [18, 8]


# %%
# Import Tables
reviews = pd.read_csv('./ml-1m/ratings.dat',
                      names=['userId', 'movieId', 'rating', 'timestamp'],
                      delimiter='::', engine='python')
# movies.dat holds accented titles that are not valid UTF-8, so the default
# decoder can fail on it; latin-1 maps every byte and never raises.
# NOTE(review): confirm the encoding against your copy of the dataset.
movies = pd.read_csv('./ml-1m/movies.dat',
                     names=['movieId', 'title', 'genres'],
                     delimiter='::', engine='python', encoding='latin-1')
users = pd.read_csv('./ml-1m/users.dat',
                    names=['userId', 'gender', 'age', 'occupation', 'zip'],
                    delimiter='::', engine='python')

# Print Table shape
print('Reviews shape:', reviews.shape)
print('Users shape:', users.shape)
print('Movies shape:', movies.shape)


# %%
# Drop unused attributes
reviews.drop(['timestamp'], axis=1, inplace=True)  # Time
users.drop(['zip'], axis=1, inplace=True)  # Zip-Code

# Extract the movie year from the title into an extra attribute.
# The year is kept as a 4-character string (NaN when the title has none).
movies['release_year'] = movies['title'].str.extract(r'(?:\((\d{4})\))?\s*$', expand=False)


# %%
# Print movie table
movies.head()


# %%
# Changed feature values based on README_users.txt
ages_map = {1: 'Under 18',
            18: '18 - 24',
            25: '25 - 34',
            35: '35 - 44',
            45: '45 - 49',
            50: '50 - 55',
            56: '56+'}

occupations_map = {0: 'Not specified',
                   1: 'Academic / Educator',
                   2: 'Artist',
                   3: 'Clerical / Admin',
                   4: 'College / Grad Student',
                   5: 'Customer Service',
                   6: 'Doctor / Health Care',
                   7: 'Executive / Managerial',
                   8: 'Farmer',
                   9: 'Homemaker',
                   10: 'K-12 student',
                   11: 'Lawyer',
                   12: 'Programmer',
                   13: 'Retired',
                   14: 'Sales / Marketing',
                   15: 'Scientist',
                   16: 'Self-Employed',
                   17: 'Technician / Engineer',
                   18: 'Tradesman / Craftsman',
                   19: 'Unemployed',
                   20: 'Writer'}

gender_map = {'M': 'Male', 'F': 'Female'}

users['age'] = users['age'].map(ages_map)
users['occupation'] = users['occupation'].map(occupations_map)
users['gender'] = users['gender'].map(gender_map)


# %%
# Plot age categories

age_reindex = ['Under 18', '18 - 24', '25 - 34', '35 - 44', '45 - 49', '50 - 55', '56+']
age_counts = users['age'].value_counts().reindex(age_reindex)
sns.barplot(x=age_counts.values,
            y=age_counts.index,
            palette='magma').set_title(
                'Users age', fontsize=12)

plt.show()


# %%
# Plot gender of users
gender_counts = users['gender'].value_counts()
colors1 = ['lightblue', 'pink']
pie = go.Pie(labels=gender_counts.index,
             values=gender_counts.values,
             marker=dict(colors=colors1),
             hole=0.5)
layout = go.Layout(title='Gender Users', font=dict(size=12), legend=dict(orientation='h'))

fig = go.Figure(data=[pie], layout=layout)
py.iplot(fig)


# %%
# Merge reviews, movie and user dataset
final_df = reviews.merge(movies, on='movieId', how='left').merge(users, on='userId', how='left')
print('final_df shape:', final_df.shape)
final_df.head()


# %%
final_df[final_df['age'] == '18 - 24']['title'].value_counts()[:10].to_frame()


# %%
# Print movie / user sum
n_movies = final_df['movieId'].nunique()
n_users = final_df['userId'].nunique()

print('Number of movies:', n_movies)
print('Number of users:', n_users)


# %%
# implement SVD with Python SurPRISE, a Python Recommendation Framework

from surprise import Reader, Dataset, SVD, SVDpp
from surprise import accuracy

reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(final_df[['userId', 'movieId', 'rating']], reader=reader)

svd = SVD(n_factors=50)
svd_plusplus = SVDpp(n_factors=50)

# train with SVD
trainset = dataset.build_full_trainset()
svd.fit(trainset)
# train with SVD++, ATTENTION this take a LONG TIME
# svd_plusplus.fit(trainset)


# %%
# Show titles instead of ids
id_2_names = dict(zip(movies['movieId'], movies['title']))


# %%
# function for test set
def Build_Anti_Testset4User(user_id):
    """Build the list of (user, item, fill) triples for items a user has not rated.

    Args:
        user_id: Raw user id as it appears in the ratings data.

    Returns:
        A list of ``(raw_user_id, raw_item_id, fill)`` tuples, one for every
        item of the module-level ``trainset`` that the user has no rating
        for. ``fill`` is ``trainset.global_mean``; it is only a placeholder
        for the unknown true rating.

    Raises:
        ValueError: Propagated from ``trainset.to_inner_uid`` when the user
            is not part of the trainset (per the Surprise documentation).
    """
    fill = trainset.global_mean
    inner_uid = trainset.to_inner_uid(user_id)
    raw_uid = trainset.to_raw_uid(inner_uid)  # invariant, so looked up once

    # ur == users ratings: (inner item id, rating) pairs for this user
    rated_items = {item_inner_id for item_inner_id, _ in trainset.ur[inner_uid]}

    return [(raw_uid, trainset.to_raw_iid(inner_iid), fill)
            for inner_iid in trainset.all_items()
            if inner_iid not in rated_items]


# %%
# Implement Top-X Chart recommender
def TopXRec_SVD(user_id, num_recommender=10, latest=False, pool_size=20):
    """Recommend movies the user has not rated, ranked by predicted rating.

    Args:
        user_id: Raw user id to recommend for.
        num_recommender: Number of rows to return.
        latest: When true, the best-rated candidates are re-ordered by
            release year (newest first) before the top rows are taken.
        pool_size: Minimum number of best-rated candidates to consider. The
            pool is widened to ``num_recommender`` when that is larger, so
            asking for more than ``pool_size`` rows is no longer truncated.

    Returns:
        A DataFrame with the columns ``title`` and ``rating`` (the estimated
        rating), holding at most ``num_recommender`` rows.
    """
    predictions = svd.test(Build_Anti_Testset4User(user_id))  # here you can change to SVD++

    # Best estimated ratings first; sorted() is stable like the list.sort()
    # it replaces, so ties keep their original order.
    n_candidates = max(pool_size, num_recommender)
    ranked = sorted(predictions, key=lambda pred: pred.est, reverse=True)[:n_candidates]

    # Join on movieId rather than title: titles are not a key, and a
    # repeated title would duplicate recommendation rows.
    candidates = pd.DataFrame({
        'movieId': pd.Series([int(pred.iid) for pred in ranked], dtype='int64'),
        'rating': pd.Series([pred.est for pred in ranked], dtype='float64'),
    }).merge(movies[['movieId', 'title', 'release_year']], on='movieId', how='left')

    if latest:
        # release_year is a 4-character string, so string order == year order
        candidates = candidates.sort_values('release_year', ascending=False)

    return candidates[['title', 'rating']].head(num_recommender)


# %%
# Run Recommender
# Printed explicitly: a cell only displays its last expression, and this
# cell ends with print statements.
print(TopXRec_SVD(363, num_recommender=10))
print(TopXRec_SVD(363, num_recommender=10, latest=True))

# Evaluation
# An anti-testset carries no true ratings (its "truth" is the global-mean
# fill value), so RMSE/MAE on it say nothing about accuracy. Evaluate on a
# held-out split instead, with a separate model so that `svd` above stays
# trained on the full data for the recommender.
from surprise.model_selection import train_test_split

eval_trainset, testset = train_test_split(dataset, test_size=0.25, random_state=42)
svd_eval = SVD(n_factors=50)
svd_eval.fit(eval_trainset)
predictions_svd = svd_eval.test(testset)
print('SVD - RMSE:', accuracy.rmse(predictions_svd, verbose=False))
print('SVD - MAE:', accuracy.mae(predictions_svd, verbose=False))