Compare commits
2 Commits
7e457e0cb1
...
acb1109781
| Author | SHA1 | Date | |
|---|---|---|---|
| acb1109781 | |||
| bedfef2605 |
242
export.py
Normal file
242
export.py
Normal file
@@ -0,0 +1,242 @@
|
||||
# This Notebook was exported from VS Code Jupyter
|
||||
# %%
|
||||
from IPython import get_ipython
|
||||
|
||||
# %%
|
||||
# This notebook was created with VS Code on Windows.
# Create the Python virtual environment.
# On macOS/Linux the equivalent command is `python3 -m venv .venv`; you may
# first need to run `sudo apt-get install python3-venv`.
shell = get_ipython().system
shell('python -m venv .venv')

# Upgrade the packaging tools, then install every library the notebook uses.
shell('pip install --user --upgrade pip')
shell('pip install --upgrade setuptools')
for package in ('seaborn', 'numpy', 'pandas', 'matplotlib', 'plotly', 'nbformat', 'surprise'):
    shell('pip install --user ' + package)
|
||||
|
||||
|
||||
# %%
|
||||
import numpy as np # maths
|
||||
import pandas as pd # data processing
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
import os
|
||||
import re
|
||||
|
||||
from plotly.offline import init_notebook_mode, iplot
|
||||
import plotly.graph_objs as go
|
||||
import plotly.offline as py
|
||||
py.init_notebook_mode(connected=True)
|
||||
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore')
|
||||
|
||||
plt.style.use('fivethirtyeight')
|
||||
plt.rcParams['figure.figsize'] = [18, 8]
|
||||
|
||||
|
||||
# %%
|
||||
# Import tables.
# The MovieLens 1M .dat files have no header row and use '::' as separator;
# a multi-character separator requires the Python parsing engine.
# The files are ISO-8859-1 (latin-1) encoded: movies.dat contains accented
# titles that do not decode as UTF-8, so the encoding is stated explicitly
# instead of depending on the platform/pandas default.
dat_format = {'delimiter': '::', 'engine': 'python', 'encoding': 'latin-1'}

reviews = pd.read_csv('./ml-1m/ratings.dat',
                      names=['userId', 'movieId', 'rating', 'timestamp'],
                      **dat_format)
movies = pd.read_csv('./ml-1m/movies.dat',
                     names=['movieId', 'title', 'genres'],
                     **dat_format)
users = pd.read_csv('./ml-1m/users.dat',
                    names=['userId', 'gender', 'age', 'occupation', 'zip'],
                    **dat_format)

# Print table shapes as (rows, columns).
print('Reviews shape:', reviews.shape)
print('Users shape:', users.shape)
print('Movies shape:', movies.shape)
|
||||
|
||||
|
||||
# %%
|
||||
# Remove the columns that the rest of the notebook never reads.
reviews.drop(columns=['timestamp'], inplace=True)  # rating time
users.drop(columns=['zip'], inplace=True)  # zip code

# Extract the movie year from the trailing "(YYYY)" of each title into an
# extra attribute; titles without such a suffix get a missing value.
year_pattern = r'(?:\((\d{4})\))?\s*$'
movies['release_year'] = movies['title'].str.extract(year_pattern, expand=False)


# %%
# Preview the movie table
movies.head()
|
||||
|
||||
|
||||
# %%
|
||||
# Replace the coded user attributes with readable labels, following
# README_users.txt.
ages_map = {
    1: 'Under 18',
    18: '18 - 24',
    25: '25 - 34',
    35: '35 - 44',
    45: '45 - 49',
    50: '50 - 55',
    56: '56+',
}

# Occupation codes are consecutive integers starting at 0, so the position of
# each label in this list is its code.
occupation_labels = [
    'Not specified',
    'Academic / Educator',
    'Artist',
    'Clerical / Admin',
    'College / Grad Student',
    'Customer Service',
    'Doctor / Health Care',
    'Executive / Managerial',
    'Farmer',
    'Homemaker',
    'K-12 student',
    'Lawyer',
    'Programmer',
    'Retired',
    'Sales / Marketing',
    'Scientist',
    'Self-Employed',
    'Technician / Engineer',
    'Tradesman / Craftsman',
    'Unemployed',
    'Writer',
]
occupations_map = dict(enumerate(occupation_labels))

gender_map = dict(M='Male', F='Female')

# Apply each mapping to its column of the users table.
for column, labels in (('age', ages_map),
                       ('occupation', occupations_map),
                       ('gender', gender_map)):
    users[column] = users[column].map(labels)
|
||||
|
||||
|
||||
# %%
|
||||
# Plot the age categories as a horizontal bar chart, ordered from the
# youngest group to the oldest rather than by frequency.
age_reindex = ['Under 18', '18 - 24', '25 - 34', '35 - 44', '45 - 49', '50 - 55', '56+']
age_counts = users['age'].value_counts().reindex(age_reindex)

age_axes = sns.barplot(x=age_counts.values, y=age_counts.index, palette='magma')
age_axes.set_title('Users age', fontsize=12)

plt.show()
|
||||
|
||||
|
||||
# %%
|
||||
# Plot the gender split of the users as a donut chart.
gender_counts = users['gender'].value_counts()
colors1 = ['lightblue', 'pink']

pie = go.Pie(
    labels=gender_counts.index,
    values=gender_counts.values,
    marker={'colors': colors1},
    hole=0.5,  # a non-zero hole turns the pie into a donut
)
layout = go.Layout(
    title='Gender Users',
    font={'size': 12},
    legend={'orientation': 'h'},
)

fig = go.Figure(data=[pie], layout=layout)
py.iplot(fig)
|
||||
|
||||
|
||||
# %%
|
||||
# Merge the reviews with the movie and user tables. Left joins keep every
# review row even if a movie or user record is missing.
reviews_with_movies = reviews.merge(movies, on='movieId', how='left')
final_df = reviews_with_movies.merge(users, on='userId', how='left')
print('final_df shape:', final_df.shape)
final_df.head()
|
||||
|
||||
|
||||
# %%
|
||||
# Ten most frequently rated titles among the users aged 18 - 24.
is_18_to_24 = final_df['age'] == '18 - 24'
final_df.loc[is_18_to_24, 'title'].value_counts().head(10).to_frame()
|
||||
|
||||
|
||||
# %%
|
||||
# Count the distinct movies and users that appear in the merged table.
n_movies, n_users = (final_df[column].nunique() for column in ('movieId', 'userId'))

print('Number of movies:', n_movies)
print('Number of users:', n_users)
|
||||
|
||||
|
||||
# %%
|
||||
# implement SVD with Python SurPRISE, a Python Recommendation Framework
|
||||
|
||||
from surprise import Reader, Dataset, SVD, SVDpp
|
||||
from surprise import accuracy
|
||||
|
||||
# Wrap the ratings in a Surprise dataset; ratings lie on a 1 to 5 scale.
reader = Reader(rating_scale=(1, 5))
rating_columns = ['userId', 'movieId', 'rating']
dataset = Dataset.load_from_df(final_df[rating_columns], reader=reader)

# Both models are configured with 50 latent factors.
svd = SVD(n_factors=50)
svd_plusplus = SVDpp(n_factors=50)

# Train SVD on every available rating.
trainset = dataset.build_full_trainset()
svd.fit(trainset)
# SVD++ is left untrained because fitting takes a LONG TIME; uncomment to use it.
# svd_plusplus.fit(trainset)
|
||||
|
||||
|
||||
# %%
|
||||
# Lookup from movie id to title, so results can show titles instead of ids.
id_2_names = dict(zip(movies['movieId'], movies['title']))
|
||||
|
||||
|
||||
# %%
|
||||
# function for test set
|
||||
def Build_Anti_Testset4User(user_id):
    """Build the anti-testset for a single user.

    Args:
        user_id: Raw user id as it appears in the ratings data.

    Returns:
        A list of ``(raw user id, raw item id, fill)`` tuples, one for every
        item of the module-level ``trainset`` that the user has not rated.
        ``fill`` is the trainset's global mean rating, used as a placeholder
        for the unknown true rating.
    """
    placeholder_rating = trainset.global_mean
    inner_uid = trainset.to_inner_uid(user_id)
    raw_uid = trainset.to_raw_uid(inner_uid)

    # trainset.ur maps an inner user id to that user's (inner item id, rating) pairs.
    already_rated = {inner_iid for inner_iid, _ in trainset.ur[inner_uid]}

    anti_testset = []
    for inner_iid in trainset.all_items():
        if inner_iid in already_rated:
            continue
        anti_testset.append((raw_uid, trainset.to_raw_iid(inner_iid), placeholder_rating))

    return anti_testset
|
||||
|
||||
|
||||
# %%
|
||||
# Implement Top-X Chart recommender
|
||||
def TopXRec_SVD(user_id, num_recommender=10, latest=False):
    """Return the best SVD recommendations for one user as a DataFrame.

    Every movie the user has not rated is scored with the module-level
    ``svd`` model, and the highest estimated ratings are reported.

    Args:
        user_id: Raw user id as it appears in the ratings data.
        num_recommender: Number of recommendations to return.
        latest: If true, order the best-rated candidates by release year
            (newest first) instead of by estimated rating.

    Returns:
        A DataFrame with the columns ``title`` and ``rating`` (the estimated
        rating), holding at most ``num_recommender`` rows.
    """
    # The candidate pool used to be fixed at 20, which silently capped the
    # result at 20 rows. It never drops below 20, so the titles competing in
    # the latest=True ordering are unchanged for small requests.
    pool_size = max(20, num_recommender)

    candidates = Build_Anti_Testset4User(user_id)
    predictions = svd.test(candidates)  # here you can change to SVD++

    # Each prediction unpacks to (user id, item id, true rating, estimate, details).
    scored = [(int(movie_id), estimate) for _, movie_id, _, estimate, _ in predictions]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    best = scored[:pool_size]

    movie_dataframe = pd.DataFrame({
        'title': [id_2_names[movie_id] for movie_id, _ in best],
        'rating': [estimate for _, estimate in best],
    }).merge(movies[['title', 'release_year']], on='title', how='left')

    if latest:
        # A stable sort keeps the higher estimate first among movies that
        # share a release year.
        newest_first = movie_dataframe.sort_values(
            'release_year', ascending=False, kind='mergesort')
        return newest_first[['title', 'rating']].head(num_recommender)

    return movie_dataframe.drop('release_year', axis=1).head(num_recommender)
|
||||
|
||||
|
||||
# %%
|
||||
# Run the recommender. The results are printed explicitly because a bare
# expression that is not the last statement of a cell is never displayed.
print(TopXRec_SVD(363, num_recommender=10))
print(TopXRec_SVD(363, num_recommender=10, latest=True))

# Evaluation
# Accuracy has to be measured on ratings the model has not seen. The
# anti-testset cannot serve as ground truth: its "true" ratings are only the
# global-mean fill value, and it spans every unrated (user, movie) pair.
# A separate model is therefore fitted on a training split and scored on the
# held-out ratings; `svd` above stays trained on all data for recommending.
from surprise.model_selection import train_test_split

eval_trainset, testset = train_test_split(dataset, test_size=0.25, random_state=42)
svd_eval = SVD(n_factors=50, random_state=42)  # same configuration as `svd`
svd_eval.fit(eval_trainset)

predictions_svd = svd_eval.test(testset)
print('SVD - RMSE:', accuracy.rmse(predictions_svd, verbose=False))
print('SVD - MAE:', accuracy.mae(predictions_svd, verbose=False))
|
||||
514
test.ipynb
514
test.ipynb
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user