# This Notebook was exported from VS Code Jupyter

# %%
from IPython import get_ipython

# %%
# This Notebook is created with VS Code on Windows
# Create python virtual environment
get_ipython().system('python -m venv .venv')
# If you want to use it on macOS/Linux
# You may need to run sudo apt-get install python3-venv first
#python3 -m venv .venv

# Install Python Packages
get_ipython().system('pip install --user --upgrade pip')
get_ipython().system('pip install --upgrade setuptools')
get_ipython().system('pip install --user seaborn')
get_ipython().system('pip install --user numpy')
get_ipython().system('pip install --user pandas')
get_ipython().system('pip install --user matplotlib')
get_ipython().system('pip install --user plotly')
get_ipython().system('pip install --user nbformat')
get_ipython().system('pip install --user surprise')

# %%
import numpy as np  # maths
import pandas as pd  # data processing
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
py.init_notebook_mode(connected=True)
import warnings
# Silences every warning for the rest of the session (including
# deprecation warnings raised by the plotting libraries).
warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = [18, 8]

# %%
# Import Tables
# The files have no header row, so column names are supplied explicitly.
# The '::' separator is longer than one character, which is why the slower
# python parsing engine is requested.
# NOTE(review): the encoding is passed explicitly so decoding does not depend
# on the pandas/platform default. 'latin-1' is assumed to be the encoding of
# the MovieLens 1M files (movie titles contain accented characters) --
# confirm against your copy of the dataset.
reviews = pd.read_csv('./ml-1m/ratings.dat',
                      names=['userId', 'movieId', 'rating', 'timestamp'],
                      delimiter='::', engine='python', encoding='latin-1')
movies = pd.read_csv('./ml-1m/movies.dat',
                     names=['movieId', 'title', 'genres'],
                     delimiter='::', engine='python', encoding='latin-1')
users = pd.read_csv('./ml-1m/users.dat',
                    names=['userId', 'gender', 'age', 'occupation', 'zip'],
                    delimiter='::', engine='python', encoding='latin-1')

# Print Table shape
print('Reviews shape:', reviews.shape)
print('Users shape:', users.shape)
print('Movies shape:', movies.shape)

# %%
# Drop unused Attributes
reviews.drop(['timestamp'], axis=1, inplace=True)  # Time
users.drop(['zip'], axis=1, inplace=True)  # Zip-Code

# Extract the movie year from title to extra attribute
# The pattern captures a four-digit number in parentheses at the very end of
# the title. The group is optional, so titles without such a suffix yield NaN
# instead of failing. The extracted values are strings, not integers.
movies['release_year'] = movies['title'].str.extract(
    r'(?:\((\d{4})\))?\s*$', expand=False)

# %%
# Print movie table
movies.head()

# %%
# Changed feature values based on README_users.txt
ages_map = {
    1: 'Under 18',
    18: '18 - 24',
    25: '25 - 34',
    35: '35 - 44',
    45: '45 - 49',
    50: '50 - 55',
    56: '56+',
}

occupations_map = {
    0: 'Not specified',
    1: 'Academic / Educator',
    2: 'Artist',
    3: 'Clerical / Admin',
    4: 'College / Grad Student',
    5: 'Customer Service',
    6: 'Doctor / Health Care',
    7: 'Executive / Managerial',
    8: 'Farmer',
    9: 'Homemaker',
    10: 'K-12 student',
    11: 'Lawyer',
    12: 'Programmer',
    13: 'Retired',
    14: 'Sales / Marketing',
    15: 'Scientist',
    16: 'Self-Employed',
    17: 'Technician / Engineer',
    18: 'Tradesman / Craftsman',
    19: 'Unemployed',
    20: 'Writer',
}

gender_map = {'M': 'Male', 'F': 'Female'}

# Series.map turns any code that is missing from the mapping into NaN.
users['age'] = users['age'].map(ages_map)
users['occupation'] = users['occupation'].map(occupations_map)
users['gender'] = users['gender'].map(gender_map)

# %%
# Plot age categories
age_reindex = ['Under 18', '18 - 24', '25 - 34', '35 - 44',
               '45 - 49', '50 - 55', '56+']
# reindex puts the bars in age order instead of descending-count order.
age_counts = users['age'].value_counts().reindex(age_reindex)

sns.barplot(x=age_counts.values, y=age_counts.index,
            palette='magma').set_title('Users age', fontsize=12)
plt.show()

# %%
# Plot gender of users
gender_counts = users['gender'].value_counts()
colors1 = ['lightblue', 'pink']

pie = go.Pie(labels=gender_counts.index,
             values=gender_counts.values,
             marker=dict(colors=colors1),
             hole=0.5)
layout = go.Layout(title='Gender Users', font=dict(size=12),
                   legend=dict(orientation='h'))
fig = go.Figure(data=[pie], layout=layout)
py.iplot(fig)

# %%
# Merge reviews, movie and user dataset
# Left joins keep every review row even if a movie or user lookup fails.
final_df = (reviews
            .merge(movies, on='movieId', how='left')
            .merge(users, on='userId', how='left'))

print('final_df shape:', final_df.shape)
final_df.head()

# %%
# Ten most-rated titles among users in the '18 - 24' age group
final_df[final_df['age'] == '18 - 24']['title'].value_counts().head(10).to_frame()

# %%
# Print movie / user sum
n_movies = final_df['movieId'].nunique()
n_users = final_df['userId'].nunique()
print('Number of movies:', n_movies)
print('Number of users:', n_users)

# %%
# implement SVD with Python SurPRISE, a Python Recommendation Framework
from surprise import Reader, Dataset, SVD, SVDpp
from surprise import accuracy
from surprise.model_selection import train_test_split

reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(final_df[['userId', 'movieId', 'rating']],
                               reader=reader)

svd = SVD(n_factors=50)
svd_plusplus = SVDpp(n_factors=50)

# train with SVD (on all ratings; this model is used for the recommendations)
trainset = dataset.build_full_trainset()
svd.fit(trainset)

# train with SVD++, ATTENTION this takes a LONG TIME
# svd_plusplus.fit(trainset)

# %%
# Show titles instead of ids
id_2_names = dict(zip(movies['movieId'], movies['title']))


# %%
# function for test set
def Build_Anti_Testset4User(user_id):
    """Build the anti-testset for one user.

    The result holds one ``(raw user id, raw item id, fill)`` tuple for every
    item in the module-level ``trainset`` that ``user_id`` has not rated, where
    ``fill`` is the global mean rating of the trainset.

    Args:
        user_id: Raw user id as it appears in the ratings data.

    Returns:
        list: Tuples in the format expected by ``algo.test``.

    Raises:
        ValueError: Propagated from ``trainset.to_inner_uid`` when the user
            is not part of the trainset (per the Surprise Trainset API).
    """
    fill = trainset.global_mean
    inner_uid = trainset.to_inner_uid(user_id)
    # The raw id is the same for every tuple, so resolve it once.
    raw_uid = trainset.to_raw_uid(inner_uid)

    # trainset.ur[u] lists the (inner item id, rating) pairs of user u.
    rated_items = {item_inner_id for item_inner_id, _ in trainset.ur[inner_uid]}

    return [(raw_uid, trainset.to_raw_iid(item), fill)
            for item in trainset.all_items()
            if item not in rated_items]


# %%
# Implement Top-X Chart recommender
def TopXRec_SVD(user_id, num_recommender=10, latest=False):
    """Recommend the best-predicted unrated movies for a user.

    Predicts a rating for every movie the user has not rated yet, keeps the
    highest-rated candidates and returns the first ``num_recommender`` of
    them.

    Args:
        user_id: Raw user id as it appears in the ratings data.
        num_recommender: Number of rows to return.
        latest: If true, order the candidate pool by release year (newest
            first) before truncating; otherwise keep the order of the
            predicted rating (best first).

    Returns:
        pandas.DataFrame: Columns ``title`` and ``rating`` (the estimated
        rating), at most ``num_recommender`` rows.
    """
    predictions = svd.test(Build_Anti_Testset4User(user_id))  # here you can change to SVD++

    # Each prediction unpacks to (uid, iid, true rating, estimate, details);
    # only the item id and the estimate are needed.
    ranked = sorted(
        ((int(movie_id), estimated) for _, movie_id, _, estimated, _ in predictions),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # The candidate pool used to be fixed at 20, which silently capped the
    # result for num_recommender > 20. Keep 20 as the minimum so results for
    # smaller requests (including latest=True) are unchanged.
    top = ranked[:max(20, num_recommender)]

    # Join the release year on movieId rather than title, so that two movies
    # sharing a title cannot add rows to the result.
    movie_dataframe = pd.DataFrame({
        'movieId': [movie_id for movie_id, _ in top],
        'title': [id_2_names[movie_id] for movie_id, _ in top],
        'rating': [estimated for _, estimated in top],
    }).merge(movies[['movieId', 'release_year']], on='movieId', how='left')

    if latest:
        movie_dataframe = movie_dataframe.sort_values('release_year',
                                                      ascending=False)
    return movie_dataframe[['title', 'rating']].head(num_recommender)


# %%
# Run Recommender
# Printed explicitly: a bare expression in the middle of a cell (or in a
# script) would be evaluated and then discarded without being shown.
print(TopXRec_SVD(363, num_recommender=10))
print(TopXRec_SVD(363, num_recommender=10, latest=True))

# Evaluation
# The anti-testset only contains unrated pairs whose "true" rating is the
# global-mean fill value, so errors measured on it say nothing about accuracy.
# Instead, hold out a quarter of the real ratings and score a model that was
# fitted on the remaining ones. `svd` above stays trained on all ratings.
eval_trainset, testset = train_test_split(dataset, test_size=0.25,
                                          random_state=42)
svd_eval = SVD(n_factors=50)
svd_eval.fit(eval_trainset)
predictions_svd = svd_eval.test(testset)
print('SVD - RMSE:', accuracy.rmse(predictions_svd, verbose=False))
print('SVD - MAE:', accuracy.mae(predictions_svd, verbose=False))