In [33]:
%matplotlib inline
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
params = {'legend.fontsize': 'x-large',
'figure.figsize': (15, 5),
'axes.labelsize': 'x-large',
'axes.titlesize':'x-large',
'xtick.labelsize':'x-large',
'ytick.labelsize':'x-large'}
matplotlib.rcParams.update(params)
In [2]:
imdb_dat = pd.read_csv("movie_metadata.csv")
imdb_dat.info()
In [3]:
import requests
import re
from bs4 import BeautifulSoup
import time
import string
# Return the Douban movie rating that matches the given movie name.
def doubanRating(name):
    movie_name = name.decode('gbk').encode('utf-8')
    url_head = 'http://movie.douban.com/subject_search'
    payload = {'search_text': movie_name}
    r = requests.get(url_head, params=payload)
    soup = BeautifulSoup(r.text, 'html.parser')
    first_hit = soup.find_all(class_='nbg')
    try:
        r2_link = first_hit[0].get('href')
        # sometimes Douban returns non-movie items (e.g. celebrity pages) first
        if 'subject' not in r2_link:
            r2_link = first_hit[1].get('href')
        r2 = requests.get(r2_link)
        soup2 = BeautifulSoup(r2.text, 'html.parser')
        title = soup2.find(property="v:itemreviewed")
        title = title.get_text()  # unicode
        # drop the leading Chinese title, keeping the original-language title
        title = ' '.join(title.split(' ')[1:])
        title = filter(lambda x: x in set(string.printable), title)
        flag = True
        if title != name:
            print "Warning: name may not match"
            flag = False
        year = soup2.find(class_='year').get_text()
        rating = soup2.find(class_="ll rating_num").get_text()
        num_review = soup2.find(property="v:votes").get_text()
        return [title, year, rating, num_review, flag]
    except (IndexError, AttributeError):
        print "Record not found for: " + name
        return [name, None, None, None, None]
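In [ ]:
# A quick smoke test of the scraper before the full run (a sketch: the title
# below is an arbitrary example, and the call needs network access).
print doubanRating("The Shawshank Redemption")
# expected: [title, year, rating, num_review, flag]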
In [4]:
#%% 2. Store scraped data
dataset = pd.read_csv("movie_metadata.csv")
total_length = 5043
#first_query = 2500
res = pd.DataFrame(columns=('movie_title', 'year', 'rating', 'num_review', 'flag'))
for i in xrange(total_length):
    # strip whitespace and the UTF-8 encoded non-breaking space from the title
    name = dataset['movie_title'][i].strip().strip('\xc2\xa0')
    res.loc[i] = doubanRating(name)
    print "finished query %d" % i
    time.sleep(10)  # be polite to the server
    if i % 50 == 0:
        res.to_csv("douban_movie_review.csv")  # checkpoint every 50 records
        print "saved until record: %d" % i
In [3]:
douban_dat = pd.read_csv("douban_movie_review.csv")
douban_dat.rename(columns = {'movie_title':'d_movie_title','year':'d_year','rating':'douban_score','num_review':'dnum_review','flag':'dflag'},inplace = True)
douban_dat.info()
In [4]:
res_dat = pd.concat([imdb_dat,douban_dat],axis = 1)
res_dat.info()
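In [ ]:
# pd.concat with axis=1 aligns the two frames positionally, so it is worth
# checking how many scraped rows were flagged as confident title matches
# (the dflag column set by doubanRating):
print res_dat['dflag'].value_counts(dropna=False)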
In [34]:
# 1. Visualize the joint distribution of ratings from IMDB (x-axis) and Douban (y-axis)
import seaborn as sns
g = sns.jointplot(x = 'imdb_score',y = 'douban_score',data = res_dat)
g.ax_joint.set(xlim=(1, 10), ylim=(1, 10))
Out[34]:
In [35]:
# Plot the rating distributions and a bar comparison of the means (the difference is significant)
from scipy import stats
nbins = 15
fig,axes = plt.subplots(nrows = 1,ncols = 2, figsize = (10,8))
ax0,ax1 = axes.flatten()
ax0.hist([res_dat.douban_score,res_dat.imdb_score],nbins, histtype = 'bar',label = ["Douban","IMDB"])
ax0.set_title('The distribution of movie ratings')
ax0.set_xlabel('Rating')
ax0.set_ylabel('Count')
ax0.legend()
imdb_score = np.mean(res_dat.imdb_score)
douban_score = np.mean(res_dat.douban_score)
ax1.bar([0,1],[imdb_score, douban_score], yerr = [np.std(res_dat.imdb_score),np.std(res_dat.douban_score)],
align = 'center',color = ['green','blue'], ecolor = 'black')
ax1.set_xticks([0,1])
ax1.set_xticklabels(['IMDB','Douban'])
ax1.set_ylabel('Score')
_, p = stats.ttest_rel(res_dat['imdb_score'], res_dat['douban_score'], nan_policy='omit')
ax1.set_title('A comparison of ratings\npaired t-test: p = %.4f' % p)
#fig.tight_layout()
plt.show()
# the paired t-test above checks whether the mean IMDB and Douban ratings differ significantly
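In [ ]:
# Ratings are bounded and not necessarily normal, so a non-parametric test is a
# reasonable complement to the paired t-test. A sketch using the Wilcoxon
# signed-rank test (rows with missing scores dropped first, since older SciPy
# versions have no nan_policy here):
paired = res_dat[['imdb_score', 'douban_score']].dropna()
stat, p_w = stats.wilcoxon(paired['imdb_score'], paired['douban_score'])
print "Wilcoxon signed-rank: p = %.4g" % p_w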
In [81]:
from sklearn import preprocessing
data = res_dat.dropna().copy()  # copy so later column assignments do not hit a view
print "After dropping rows with null values, the remaining data has shape", data.shape
In [82]:
data.loc[:,'scaled_imdb'] = preprocessing.scale(data['imdb_score'])
data.loc[:,'scaled_douban'] = preprocessing.scale(data['douban_score'])
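In [ ]:
# preprocessing.scale standardizes a column to zero mean and unit variance;
# a one-line check confirms the transform:
print data['scaled_imdb'].mean(), data['scaled_imdb'].std()  # ~0 and ~1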
In [85]:
#stats.ttest_rel(data['scaled_imdb'], data['scaled_douban'],nan_policy = 'omit')
from scipy.stats import norm, lognorm
import matplotlib.mlab as mlab
fig,axes = plt.subplots(nrows = 1,ncols = 2, figsize = (10,8))
ax0,ax1 = axes.flatten()
ax0.plot(data['scaled_imdb'],data['scaled_douban'],'ro')
ax0.set_title('Normalized Scores')
ax0.set_xlabel('Scaled IMDB score')
ax0.set_ylabel('Scaled Douban score')
data.loc[:,'rating_diff'] = data['scaled_imdb'] - data['scaled_douban']
(mu,sigma) = norm.fit(data['rating_diff'])
_, bins, _ = ax1.hist(data['rating_diff'], 60, normed=1, histtype='bar', alpha=0.75)
ax1.plot(bins, mlab.normpdf(bins, mu, sigma), 'r--', linewidth=2)
ax1.set_xlabel('Scaled IMDB score - scaled Douban score')
ax1.set_ylabel('Density')
ax1.set_title('Distribution of rating differences')
fig.tight_layout()
plt.show()
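In [ ]:
# Whether the fitted normal curve is a good description can be tested formally;
# a sketch using D'Agostino's normality test from SciPy:
k2, p_norm = stats.normaltest(data['rating_diff'])
print "normaltest: p = %.4g" % p_norm  # a small p suggests the differences are not normal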
In [87]:
data.describe()
Out[87]:
In [88]:
data.describe(include = ['object'])
Out[88]:
In [98]:
ind = data['rating_diff'].idxmax()  # index label of the largest IMDB-over-Douban gap
print data.loc[ind].movie_title
print data.loc[ind].scaled_imdb
print data.loc[ind].scaled_douban
print data.loc[ind].title_year
print data.loc[ind].movie_imdb_link
print data.loc[ind].d_year
print data.loc[ind].douban_score
print data.loc[ind].imdb_score
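In [ ]:
# The same lookup works for the opposite extreme: the movie Douban rates far
# above IMDB.
ind_min = data['rating_diff'].idxmin()
print data.loc[ind_min].movie_title
print data.loc[ind_min].imdb_score, data.loc[ind_min].douban_score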
In [93]:
data.columns
Out[93]:
In [8]:
# 2. Predict differences in ratings
# note: positive diff_rating means Douban rated the movie higher than IMDB
res_dat['diff_rating'] = res_dat['douban_score'] - res_dat['imdb_score']
# 2.1. Convert the categorical variable genre to dummy variables.
# Only the first genre in the list is kept, to simplify the problem.
res_dat['genre1'] = res_dat.apply(lambda row: row['genres'].split('|')[0], axis=1)
#res_dat['genre1'].value_counts()
# There are 21 genres, so only the top 7 are encoded as dummies;
# the remaining genres are grouped as 'Other'.
top_genre = ['Comedy','Action','Drama','Adventure','Crime','Biography','Horror']
res_dat['top_genre'] = res_dat.apply(lambda row: row['genre1'] if row['genre1'] in top_genre else 'Other', axis=1)
# select num_user_for_reviews, director_facebook_likes, actor_1_facebook_likes,
# gross, budget and dnum_review for EDA
res_subdat = res_dat[['top_genre','num_user_for_reviews','director_facebook_likes','actor_1_facebook_likes','gross','budget','dnum_review','diff_rating']]
res_subdat = pd.get_dummies(res_subdat, prefix=['top_genre'])
#res_dat = pd.get_dummies(res_dat, prefix=['top_genre'])
res_subdat.shape
Out[8]:
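In [ ]:
# A quick inspection of exactly which indicator columns get_dummies created:
print [c for c in res_subdat.columns if c.startswith('top_genre_')]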
In [9]:
# create a subset for visualization and preliminary analysis
col2 = [u'num_user_for_reviews', u'director_facebook_likes',
u'actor_1_facebook_likes', u'gross', u'budget', u'dnum_review', u'top_genre_Action', u'top_genre_Adventure',
u'top_genre_Biography', u'top_genre_Comedy', u'top_genre_Crime',
u'top_genre_Drama', u'top_genre_Horror', u'top_genre_Other',u'diff_rating']
res_subdat = res_subdat[col2]
In [10]:
# a subset for plotting correlation
col_cat = [u'gross', u'budget', u'dnum_review',u'num_user_for_reviews',u'top_genre_Action', u'top_genre_Adventure',
u'top_genre_Biography', u'top_genre_Comedy', u'top_genre_Crime',
u'top_genre_Drama', u'top_genre_Horror', u'diff_rating']
res_subdat_genre = res_subdat[col_cat]
In [11]:
# show pair-wise correlations between the rating difference and candidate predictors
corr = res_subdat_genre.corr()
sns.set(style="white")
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# mask the redundant upper triangle of the symmetric correlation matrix
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, square=True, linewidths=.5,
            cbar_kws={"shrink": .5}, ax=ax)
Out[11]:
In [12]:
# prepare the training feature columns and the target column
col_train = col2[:len(col2)-1]
col_target = col2[len(col2)-1]
cl_res_subdat = res_subdat.dropna(axis=0)
In [13]:
cl_res_subdat.shape
In [14]:
# 2.2 Use a Random Forest regressor for prediction
X_cat = res_subdat.loc[:, 'top_genre_Action':'top_genre_Other']
num_col = []
for i in res_dat.columns:
    if res_dat[i].dtype != 'object':
        num_col.append(i)
X_num = res_dat[num_col]
X = pd.concat([X_cat, X_num], axis=1)
X = X.dropna(axis=0)
y = X['diff_rating']
X = X.iloc[:, :-1]
# drop the raw scores: their difference is the target, so keeping them would leak it
X.drop(['imdb_score', 'douban_score'], axis=1, inplace=True)
from sklearn.model_selection import train_test_split
# METHOD 1: build a random forest regressor
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
In [15]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 500)
forest = rf.fit(X_train, y_train)
score_r2 = rf.score(X_val,y_val)
# R-squared on the held-out validation set
print score_r2
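In [ ]:
# A single 10% validation split gives a noisy estimate; cross-validation on the
# full feature matrix is a more stable check (a sketch with a smaller forest to
# keep the run time down):
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(RandomForestRegressor(n_estimators=100), X, y, cv=5)
print cv_scores.mean(), cv_scores.std()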
In [16]:
rf_features = sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), X.columns),reverse = True)
In [17]:
import matplotlib.pyplot as plt
imps,feas = zip(*(rf_features[0:4]+rf_features[6:12]))
ypos = np.arange(len(feas))
plt.barh(ypos,imps,align = 'center',alpha = 0.5)
plt.yticks(ypos,feas)
plt.xlabel('Feature Importance')
Out[17]:
In [23]:
plt.subplot(1,2,1)
plt.plot(y_train,rf.predict(X_train),'o')
plt.xlabel('Training_y')
plt.ylabel('Predict_y')
plt.xlim(-6,6)
plt.ylim(-6,6)
plt.subplot(1,2,2)
plt.plot(y_val,rf.predict(X_val),'o')
plt.xlabel('val_y')
plt.ylabel('Predict_y')
plt.xlim(-3,4)
plt.ylim(-3,4)
Out[23]:
In [18]:
X.columns
Out[18]:
In [19]:
# METHOD 2: Lasso regression
from sklearn.linear_model import Lasso
Lassoreg = Lasso(alpha=1e-4, normalize=True, random_state=42)
# fit on the training split only, so the validation score is not inflated
Lassoreg.fit(X_train, y_train)
score_r2 = Lassoreg.score(X_val, y_val)
print score_r2
Ls_features = sorted(zip(map(lambda x: round(x, 4), Lassoreg.coef_), X.columns))
print Ls_features
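In [ ]:
# The alpha = 1e-4 above is hand-picked; LassoCV can choose it by
# cross-validation instead (a sketch under the same normalize=True setting):
from sklearn.linear_model import LassoCV
lcv = LassoCV(cv=5, normalize=True, random_state=42).fit(X_train, y_train)
print lcv.alpha_, lcv.score(X_val, y_val)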
In [20]:
y_val_rf = rf.predict(X_val)
y_val_Ls = Lassoreg.predict(X_val)
y_val_pred = (y_val_rf+y_val_Ls)/2
from sklearn.metrics import r2_score
print r2_score(y_val,y_val_pred)
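In [ ]:
# The 50/50 average is arbitrary; scanning the blend weight on the validation
# set shows whether a different mix helps (a sketch; note this lightly overfits
# the validation split):
for w in np.arange(0, 1.1, 0.1):
    blend = w * y_val_rf + (1 - w) * y_val_Ls
    print "w=%.1f  R^2=%.4f" % (w, r2_score(y_val, blend))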
In [21]:
import matplotlib.pyplot as plt
imps,feas = zip(*(Ls_features[0:4]+Ls_features[-4:]))
ypos = np.arange(len(feas))
plt.barh(ypos,imps,align = 'center',alpha = 0.5)
plt.yticks(ypos,feas)
plt.xlabel('Feature Importance (Coefficient)')
Out[21]:
In [24]:
plt.subplot(1,2,1)
plt.plot(y_train,Lassoreg.predict(X_train),'o')
plt.xlabel('Training_y')
plt.ylabel('Predict_y')
plt.xlim(-6,6)
plt.ylim(-6,6)
plt.subplot(1,2,2)
plt.plot(y_val,Lassoreg.predict(X_val),'o')
plt.xlabel('val_y')
plt.ylabel('Predict_y')
plt.xlim(-3,4)
plt.ylim(-3,4)
Out[24]:
In [ ]: