In [17]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import math
# Column headers for the dataset; the names are supplied explicitly to read_csv below.
data_cols = ['user id', 'movie id', 'rating', 'timestamp']
item_cols = [
    'movie id', 'movie title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
    'Romance ',  # NOTE(review): this label carries a trailing space; kept unchanged
    'Sci-Fi', 'Thriller', 'War', 'Western',
]
user_cols = ['user id', 'age', 'gender', 'occupation', 'zip code']

# Import the data files into dataframes (ratings are tab-separated, items/users pipe-separated).
data_df = pd.read_csv('ml-100k/u.data', sep='\t', names=data_cols, encoding='latin-1')
item_df = pd.read_csv('ml-100k/u.item', sep='|', names=item_cols, encoding='latin-1')
user_df = pd.read_csv('ml-100k/u.user', sep='|', names=user_cols, encoding='latin-1')

# Drop unnecessary columns, addressed by label rather than by position.
# Ratings: the voting timestamp is removed.
data_df = data_df.drop(['timestamp'], axis=1)
# Items: movie title, video release date and IMDb URL are removed.
item_df = item_df.drop(['movie title', 'video release date', 'IMDb URL'], axis=1)
# Users: occupation and zip code are removed.
user_df = user_df.drop(['occupation', 'zip code'], axis=1)
In [18]:
# Plain-text preview of the first five rating rows.
print(data_df.head(n=5))
In [19]:
# Plain-text preview of the first five item rows.
print(item_df.head(n=5))
In [20]:
# Adjust the release date to keep only the year.
# errors='coerce' turns values that cannot be parsed as dates into missing values.
item_df['release date'] = (
    pd.to_datetime(item_df['release date'], errors='coerce')
    .dt.year
)
In [21]:
# Preview the item rows again, now with the release date reduced to a year.
print(item_df.head(n=5))
In [22]:
# Plain-text preview of the first five user rows.
print(user_df.head(n=5))
In [23]:
# Convert the gender column to numeric codes: 'F' -> 1, 'M' -> 2.
# The result is assigned back to the frame instead of calling
# replace(..., inplace=True) on the `user_df['gender']` selection: an in-place
# call on a selected column acts on an intermediate object, and is not
# guaranteed to update `user_df` (it does not under pandas Copy-on-Write).
# infer_objects() converts the column to a numeric dtype once every value has
# been replaced; any value other than 'F'/'M' is left untouched, so re-running
# this cell is a no-op.
user_df['gender'] = user_df['gender'].replace({'F': 1, 'M': 2}).infer_objects()
In [24]:
# Compute the mean release year, rounded to a whole number; it is the fill
# value for missing release dates.
release_year_mean = item_df['release date'].mean()
meanYear = int(round(release_year_mean))
print(meanYear)
In [25]:
# Replace missing release years with the rounded mean year.
item_df['release date'] = item_df['release date'].fillna(value=meanYear)
In [26]:
# Sanity check: prints True if any release year is still missing, False otherwise.
print(bool(item_df['release date'].isnull().any()))
In [27]:
# Merge it all: ratings joined with item attributes on "movie id", then the
# result joined with user attributes on "user id" (default inner joins).
data_item = data_df.merge(item_df, on="movie id")
data_item_user = data_item.merge(user_df, on="user id")
dataset = data_item_user
In [28]:
# Plain-text preview of the first five rows of the merged dataset.
print(dataset.head(n=5))
In [29]:
# Data distribution: summary statistics of the merged dataset.
summary_stats = dataset.describe()
display(summary_stats)
In [30]:
# Show the current Dataset Structure
# NOTE(review): `display` is already called in the previous cell, before this
# import runs; that presumably relies on IPython exposing `display` as a
# built-in - confirm, or move this import to the notebook's first cell.
from IPython.display import display
# Renders the whole merged frame through the notebook's rich display.
display(dataset)
In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
# Calculate feature relevance to the dataset: for every column, fit a decision
# tree that predicts that column from all the remaining columns, then print the
# value returned by `regressor.score` on the held-out split.
for name in dataset.columns:
    # Target: the column to be predicted.
    y = dataset[name]
    # Features: every other column. `drop` returns a new frame, so `dataset`
    # itself is never modified and no defensive copy is needed.
    X = dataset.drop(name, axis=1)
    # Split the data for model calibration; the fixed seed keeps runs repeatable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=72)
    regressor = DecisionTreeRegressor(random_state=72)
    regressor.fit(X_train, y_train)
    score = regressor.score(X_test, y_test)
    # .format() must be applied to the string, not to the return value of print().
    print('Genre {} score = {}'.format(name, score))
# User Id and Movie Id have a weak relation within the other features, but ...
# (the original note ends here, unfinished)
In [ ]:
# Based on the feature relevance results:
# 1st - build a working copy of the dataset without its first column
#       (column 0, expected to be the user id).
smartdata = dataset.drop(dataset.columns[[0]], axis=1)
print(smartdata.head())
In [ ]:
# 2nd - translate the rating into a more discrete evaluation (like / dislike):
#   1 - 2.9 : DISLIKE
#   3 - 5   : LIKE
# TODO: the mapping above is not implemented yet; this cell only prints every
# rating value. Iterating the Series directly yields the same values the
# removed `Series.iteritems()` call used to provide.
for rating in smartdata['rating']:
    print(rating)
In [32]:
# Scatter matrix for every pair of features, with a KDE on the diagonal.
# The trailing semicolon suppresses the textual repr of the returned axes.
pd.plotting.scatter_matrix(
    dataset,
    alpha=0.3,
    figsize=(14, 8),
    diagonal='kde',
);
In [ ]:
# Feature scaling
# Scale the data using the natural logarithm.
# NOTE(review): np.log yields -inf for zeros and NaN for negative values; if any
# column contains 0 (likely for the genre indicator columns - confirm), the KDE
# plots below will not be meaningful. Consider np.log1p or restricting the
# transform to strictly positive columns.
log_data = np.log(dataset)
# Scale the sample data using the natural logarithm.
# NOTE(review): `samples` is not defined in any cell visible here - confirm it
# exists before this cell runs.
log_samples = np.log(samples)
# Produce a scatter matrix for each pair of newly-transformed features.
# `scatter_matrix` lives in `pd.plotting`, as the earlier scatter-matrix cell
# already uses; the top-level `pd.scatter_matrix` alias no longer exists.
pd.plotting.scatter_matrix(log_data, alpha=0.3, figsize=(14, 8), diagonal='kde');
In [ ]:
# Build a lookup array with one row per movie title:
# [mean movie id, mean rating, movie title].
# NOTE(review): `item` and `data` are not defined in any cell visible here
# (the frames loaded above are named `item_df` / `data_df`, and `item_df` no
# longer has a 'movie title' column) - confirm where these come from.
item_list = (
    pd.merge(item, data)
    .sort_values(by='movie id')
    # Select the numeric columns with a list: tuple-style selection on a groupby
    # is no longer accepted, and the string title column cannot be averaged.
    .groupby('movie title')[['movie id', 'rating']]
    .mean()
)
# The group key (movie title) is the index; expose it as a regular column too.
item_list['movie title'] = item_list.index
# `.values` is the replacement for the removed `DataFrame.as_matrix()`.
item_list = item_list.values
In [ ]:
# Collect the item_list row at position i - 1 for every entry i of `recommendation`.
# NOTE(review): `recommendation` is not defined in any cell visible here, and it
# is rebound to a DataFrame below, so this cell cannot simply be re-run.
recommendation_list = [item_list[i - 1] for i in recommendation]
# Tabulate the collected rows and order them by mean rating, best first.
recommendation = pd.DataFrame(
    recommendation_list,
    columns=['movie id', 'mean rating', 'movie title'],
).sort_values(by='mean rating', ascending=False)
print(recommendation[['mean rating', 'movie title']])
In [ ]: