In [33]:
%matplotlib inline
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
params = {'legend.fontsize': 'x-large',
          'figure.figsize': (15, 5),
         'axes.labelsize': 'x-large',
         'axes.titlesize':'x-large',
         'xtick.labelsize':'x-large',
         'ytick.labelsize':'x-large'}
matplotlib.rcParams.update(params)

Data from the "IMDB5000" database


In [2]:
imdb_dat = pd.read_csv("movie_metadata.csv")
imdb_dat.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 28 columns):
color                        5024 non-null object
director_name                4939 non-null object
num_critic_for_reviews       4993 non-null float64
duration                     5028 non-null float64
director_facebook_likes      4939 non-null float64
actor_3_facebook_likes       5020 non-null float64
actor_2_name                 5030 non-null object
actor_1_facebook_likes       5036 non-null float64
gross                        4159 non-null float64
genres                       5043 non-null object
actor_1_name                 5036 non-null object
movie_title                  5043 non-null object
num_voted_users              5043 non-null int64
cast_total_facebook_likes    5043 non-null int64
actor_3_name                 5020 non-null object
facenumber_in_poster         5030 non-null float64
plot_keywords                4890 non-null object
movie_imdb_link              5043 non-null object
num_user_for_reviews         5022 non-null float64
language                     5031 non-null object
country                      5038 non-null object
content_rating               4740 non-null object
budget                       4551 non-null float64
title_year                   4935 non-null float64
actor_2_facebook_likes       5030 non-null float64
imdb_score                   5043 non-null float64
aspect_ratio                 4714 non-null float64
movie_facebook_likes         5043 non-null int64
dtypes: float64(13), int64(3), object(12)
memory usage: 1.1+ MB

Scrape data from DOUBAN.COM


In [3]:
import requests
import re
from bs4 import BeautifulSoup
import time
import string
# return the douban movie rating that matches the movie name and year
# read in the movie name
def doubanRating(name):
    movie_name = name.decode('gbk').encode('utf-8')
    url_head = 'http://movie.douban.com/subject_search'
    pageload = {'search_text': movie_name}
    r = requests.get(url_head,params = pageload)
    soup = BeautifulSoup(r.text,'html.parser')
    first_hit = soup.find_all(class_= 'nbg')
    try:
        r2_link = first_hit[0].get('href')
        # sometime douban returns items like celebrity instead of movies    
        if 'subject' not in r2_link:
            r2_link = first_hit[1].get('href')
        r2 = requests.get(r2_link)
        soup2 = BeautifulSoup(r2.text,'html.parser')
        title = soup2.find(property = "v:itemreviewed")
        title = title.get_text() # in unicode
        # remove Chinese characters
        title = ' '.join((title.split(' '))[1:])
        title = filter(lambda x:x in set(string.printable),title)
        flag = True
        if title != name:
            print "Warning: name may not match"
            flag = False
        year = (soup2.find(class_='year')).get_text()# in unicode
        rating = (soup2.find(class_="ll rating_num")).get_text() # in unicode
        num_review = (soup2.find(property="v:votes")).get_text()
        return [title, year, rating,num_review,flag]
    except:
        print "Record not found for: "+name
        return [name, None, None, None, None]

In [4]:
#%%2. Store scrapped data    
dataset = pd.read_csv("movie_metadata.csv")
total_length = 5043
#first_query = 2500
res = pd.DataFrame(columns = ('movie_title','year','rating','num_review','flag'))
for i in xrange(1,total_length):
    name = dataset['movie_title'][i].strip().strip('\xc2\xa0')
    res.loc[i] = doubanRating(name)
    print "slowly and finally done %d query"%i
    time.sleep(10)
    if (i%50==0):
        res.to_csv("douban_movie_review.csv")
        print "saved until record: %d"%i


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-1cd75b30270a> in <module>()
      6 for i in xrange(1,total_length):
      7     name = dataset['movie_title'][i].strip().strip('\xc2\xa0')
----> 8     res.loc[i] = doubanRating(name)
      9     print "slowly and finally done %d query"%i
     10     time.sleep(10)

NameError: name 'doubanRating' is not defined

1. Preliminary data visualization and analysis


In [3]:
douban_dat = pd.read_csv("douban_movie_review.csv")
douban_dat.rename(columns = {'movie_title':'d_movie_title','year':'d_year','rating':'douban_score','num_review':'dnum_review','flag':'dflag'},inplace = True)
douban_dat.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 6 columns):
Unnamed: 0       5043 non-null int64
d_movie_title    4975 non-null object
d_year           4447 non-null float64
douban_score     4447 non-null float64
dnum_review      4447 non-null float64
dflag            4447 non-null object
dtypes: float64(3), int64(1), object(2)
memory usage: 236.5+ KB

In [4]:
res_dat = pd.concat([imdb_dat,douban_dat],axis = 1)
res_dat.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 34 columns):
color                        5024 non-null object
director_name                4939 non-null object
num_critic_for_reviews       4993 non-null float64
duration                     5028 non-null float64
director_facebook_likes      4939 non-null float64
actor_3_facebook_likes       5020 non-null float64
actor_2_name                 5030 non-null object
actor_1_facebook_likes       5036 non-null float64
gross                        4159 non-null float64
genres                       5043 non-null object
actor_1_name                 5036 non-null object
movie_title                  5043 non-null object
num_voted_users              5043 non-null int64
cast_total_facebook_likes    5043 non-null int64
actor_3_name                 5020 non-null object
facenumber_in_poster         5030 non-null float64
plot_keywords                4890 non-null object
movie_imdb_link              5043 non-null object
num_user_for_reviews         5022 non-null float64
language                     5031 non-null object
country                      5038 non-null object
content_rating               4740 non-null object
budget                       4551 non-null float64
title_year                   4935 non-null float64
actor_2_facebook_likes       5030 non-null float64
imdb_score                   5043 non-null float64
aspect_ratio                 4714 non-null float64
movie_facebook_likes         5043 non-null int64
Unnamed: 0                   5043 non-null int64
d_movie_title                4975 non-null object
d_year                       4447 non-null float64
douban_score                 4447 non-null float64
dnum_review                  4447 non-null float64
dflag                        4447 non-null object
dtypes: float64(16), int64(4), object(14)
memory usage: 1.3+ MB

1.1 Visulze the gross distribution of rating from IMDB (x-axis) and Douban (y-axis)


In [34]:
# 1. visulize the gross distribution of ratings from imdb(x-axis) and douban(y-axis)
import seaborn as sns
g = sns.jointplot(x = 'imdb_score',y = 'douban_score',data = res_dat)
g.ax_joint.set(xlim=(1, 10), ylim=(1, 10))


Out[34]:
[(1, 10), (1, 10)]

1.2 Is it necessary to recenter(scale) IMDB score and Douban score?


In [35]:
# plot distribution and bar graphs(significantly different)
from scipy import stats
nbins = 15
fig,axes = plt.subplots(nrows = 1,ncols = 2, figsize = (10,8))
ax0,ax1 = axes.flatten()
ax0.hist([res_dat.douban_score,res_dat.imdb_score],nbins, histtype = 'bar',label = ["Douban","IMDB"])
ax0.set_title('The distribution of movie ratings')
ax0.set_xlabel('Rating')
ax0.set_ylabel('Count')
ax0.legend()
imdb_score = np.mean(res_dat.imdb_score)
douban_score = np.mean(res_dat.douban_score)
ax1.bar([0,1],[imdb_score, douban_score], yerr = [np.std(res_dat.imdb_score),np.std(res_dat.douban_score)], 
        align = 'center',color = ['green','blue'], ecolor = 'black')
ax1.set_xticks([0,1])
ax1.set_xticklabels(['IMDB','Douban'])
ax1.set_ylabel('Score')
_,p = stats.ttest_rel(res_dat['imdb_score'], res_dat['douban_score'],nan_policy = 'omit')
ax1.set_title('A comparison of ratings\n'+'t-test: p = %.4f***'%p)
#fig.tight_layout()
plt.show()
# any significant differences


1.3 Normalize IMDB and Douban rating scores


In [81]:
from sklearn import preprocessing
data = res_dat.dropna()
print " delete null values, the remaining data is",data.shape


 delete null values, the remaining data is (3468, 34)

In [82]:
data.loc[:,'scaled_imdb'] = preprocessing.scale(data['imdb_score']) 
data.loc[:,'scaled_douban'] = preprocessing.scale(data['douban_score'])

In [85]:
#stats.ttest_rel(data['scaled_imdb'], data['scaled_douban'],nan_policy = 'omit')
from scipy.stats import norm, lognorm
import matplotlib.mlab as mlab
fig,axes = plt.subplots(nrows = 1,ncols = 2, figsize = (10,8))
ax0,ax1 = axes.flatten()
ax0.plot(data['scaled_imdb'],data['scaled_douban'],'ro')
ax0.set_title('Normalized Scores')
ax0.set_xlabel('Scaled IMDB score')
ax0.set_ylabel('Scaled Douban score')
data.loc[:,'rating_diff'] = data['scaled_imdb'] - data['scaled_douban']
(mu,sigma) = norm.fit(data['rating_diff'])
_,bins,_ = ax1.hist(data['rating_diff'],60,normed = 1, histtype = 'bar',alpha = 0.75)
ax1.plot(bins, mlab.normpdf(bins,mu,sigma),'r--',linewidth = 2)
ax1.set_xlabel('IMDB_score - Douban_score')
ax1.set_ylabel('percentage')
ax1.set_title('Rating difference Distribution')
fig.tight_layout()
plt.show()


1.4 Visulze Features


In [87]:
data.describe()


Out[87]:
num_critic_for_reviews duration director_facebook_likes actor_3_facebook_likes actor_1_facebook_likes gross num_voted_users cast_total_facebook_likes facenumber_in_poster num_user_for_reviews ... imdb_score aspect_ratio movie_facebook_likes Unnamed: 0 d_year douban_score dnum_review scaled_imdb scaled_douban rating_diff
count 3468.000000 3468.000000 3468.000000 3468.000000 3468.000000 3.468000e+03 3.468000e+03 3468.000000 3468.000000 3468.000000 ... 3468.000000 3468.000000 3468.000000 3468.000000 3468.000000 3468.000000 3468.000000 3.468000e+03 3.468000e+03 3.468000e+03
mean 172.115629 110.626009 833.070934 790.064014 8068.584487 5.501582e+07 1.104752e+05 11969.002884 1.369666 347.628893 ... 6.487716 2.114175 9749.908304 2074.490484 -2003.390138 7.076701 32749.026817 -1.243839e-16 -2.091752e-16 1.836926e-16
std 123.124784 22.718718 3131.479201 1927.206403 15960.699190 7.171113e+07 1.552261e+05 19597.847902 2.000439 417.836395 ... 1.047347 0.270469 21785.161332 1329.638038 10.442049 0.928766 65573.107152 1.000144e+00 1.000144e+00 7.798167e-01
min 2.000000 37.000000 0.000000 0.000000 0.000000 1.620000e+02 5.750000e+02 0.000000 0.000000 5.000000 ... 1.600000 1.180000 0.000000 0.000000 -2017.000000 3.000000 27.000000 -4.667432e+00 -4.390006e+00 -4.889640e+00
25% 82.000000 96.000000 11.000000 208.750000 780.000000 1.021051e+07 2.245500e+04 2016.000000 0.000000 117.750000 ... 5.900000 1.850000 0.000000 933.750000 -2010.000000 6.400000 1231.750000 -5.612285e-01 -7.287074e-01 -3.973756e-01
50% 143.000000 106.000000 66.000000 447.000000 2000.000000 3.204881e+07 5.731250e+04 4319.500000 1.000000 216.000000 ... 6.600000 2.350000 258.000000 1943.500000 -2005.000000 7.100000 6790.500000 1.072232e-01 2.508930e-02 4.760155e-02
75% 227.000000 120.000000 248.000000 701.000000 13000.000000 7.017879e+07 1.346860e+05 16729.250000 2.000000 411.250000 ... 7.200000 2.350000 12000.000000 3080.250000 -1999.000000 7.700000 30995.750000 6.801818e-01 6.712008e-01 4.646173e-01
max 813.000000 330.000000 23000.000000 23000.000000 640000.000000 7.605058e+08 1.689764e+06 656730.000000 43.000000 5060.000000 ... 9.300000 2.760000 349000.000000 5042.000000 -1921.000000 9.600000 781648.000000 2.685537e+00 2.717220e+00 4.090872e+00

8 rows × 23 columns


In [88]:
data.describe(include = ['object'])


Out[88]:
color director_name actor_2_name genres actor_1_name movie_title actor_3_name plot_keywords movie_imdb_link language country content_rating d_movie_title dflag
count 3468 3468 3468 3468 3468 3468 3468 3468 3468 3468 3468 3468 3468 3468
unique 2 1533 2028 709 1310 3371 2403 3371 3371 31 43 12 3294 3
top Color Steven Spielberg Morgan Freeman Comedy|Drama|Romance Robert De Niro Home Steve Coogan eighteen wheeler|illegal street racing|truck|t... http://www.imdb.com/title/tt1976009/?ref_=fn_t... English USA R Mad Max: Fury Road TRUE
freq 3360 25 20 133 42 3 8 3 3 3350 2786 1544 5 3080

In [98]:
ind = data['rating_diff'].argmax()
print data.iloc[ind].movie_title
print data.iloc[ind].scaled_imdb
print data.iloc[ind].scaled_douban
print data.iloc[ind].title_year
print data.iloc[ind].movie_imdb_link
print data.iloc[ind].d_year
print data.iloc[ind].douban_score
print data.iloc[ind].imdb_score


The Scorch Trials 
-0.0837629805358
-1.48250415851
2015.0
http://www.imdb.com/title/tt4046784/?ref_=fn_tt_tt_1
-2015.0
5.7

In [93]:
data.columns


Out[93]:
Index([u'color', u'director_name', u'num_critic_for_reviews', u'duration',
       u'director_facebook_likes', u'actor_3_facebook_likes', u'actor_2_name',
       u'actor_1_facebook_likes', u'gross', u'genres', u'actor_1_name',
       u'movie_title', u'num_voted_users', u'cast_total_facebook_likes',
       u'actor_3_name', u'facenumber_in_poster', u'plot_keywords',
       u'movie_imdb_link', u'num_user_for_reviews', u'language', u'country',
       u'content_rating', u'budget', u'title_year', u'actor_2_facebook_likes',
       u'imdb_score', u'aspect_ratio', u'movie_facebook_likes', u'Unnamed: 0',
       u'd_movie_title', u'd_year', u'douban_score', u'dnum_review', u'dflag',
       u'scaled_imdb', u'scaled_douban', u'rating_diff'],
      dtype='object')

In [8]:
# 2. Predict differences in ratings
res_dat['diff_rating'] = res_dat['douban_score']-res_dat['imdb_score'] 
# 2.1. covert categorical variable Genre to Dummy variables
# only extract the first genre out of the list to simplify the problem
res_dat['genre1'] = res_dat.apply(lambda row:(row['genres'].split('|'))[0],axis = 1)
#res_dat['genre1'].value_counts()
# Because there are 21 genres, here we only choose the top 7 to convert to index
top_genre = ['Comedy','Action','Drama','Adventure','Crime','Biography','Horror']
# The rest of genre types we just consider them as others
res_dat['top_genre'] = res_dat.apply(lambda row:row['genre1'] if row['genre1'] in top_genre else 'Other',axis =1)
#select num_user_for_reviews ,director_facebook_likes ,actor_1_facebook_likes  ,gross , genres,
#budget,# dnum_review # for EDA
res_subdat = res_dat[['top_genre','num_user_for_reviews','director_facebook_likes','actor_1_facebook_likes','gross','budget','dnum_review','diff_rating']]
res_subdat = pd.get_dummies(res_subdat,prefix =['top_genre'])
#res_dat = pd.get_dummies(res_dat,prefix = ['top_genre'])
res_subdat.shape


Out[8]:
(5043, 15)

In [9]:
# create a subset for visualization and preliminary analysis
col2 = [u'num_user_for_reviews', u'director_facebook_likes',
       u'actor_1_facebook_likes', u'gross', u'budget', u'dnum_review', u'top_genre_Action', u'top_genre_Adventure',
       u'top_genre_Biography', u'top_genre_Comedy', u'top_genre_Crime',
       u'top_genre_Drama', u'top_genre_Horror', u'top_genre_Other',u'diff_rating']
res_subdat = res_subdat[col2]

In [10]:
# a subset for plotting correlation
col_cat = [u'gross', u'budget', u'dnum_review',u'num_user_for_reviews',u'top_genre_Action', u'top_genre_Adventure',
       u'top_genre_Biography', u'top_genre_Comedy', u'top_genre_Crime',
       u'top_genre_Drama', u'top_genre_Horror', u'diff_rating']
res_subdat_genre = res_subdat[col_cat]

In [11]:
# show pair-wise correlation between differences in ratings and estimators
import matplotlib.pylab as plt
import numpy as np
corr = res_subdat_genre.corr()
sns.set(style = "white")
f,ax = plt.subplots(figsize=(11,9))
cmap = sns.diverging_palette(220,10,as_cmap=True)
mask = np.zeros_like(corr,dtype = np.bool)
sns.heatmap(corr,mask = mask,cmap = cmap, vmax=.3,square = True, linewidths = .5,
            cbar_kws = {"shrink": .5},ax = ax)


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x10cc2e050>

In [12]:
# prepare trainning set and target set
col_train = col2[:len(col2)-1]
col_target = col2[len(col2)-1]
#cl_res_subdat = res_subdat.dropna(axis =0)

In [13]:
cl_res_subdat.shaperating_diff


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-13-6e97cd50e0d0> in <module>()
----> 1 cl_res_subdat.shape

NameError: name 'cl_res_subdat' is not defined

In [14]:
# 2.2 Use Random Forest Regressor for prediction
X_cat = res_subdat.ix[:,'top_genre_Action':'top_genre_Other']
num_col = []
for i in res_dat.columns:
    if res_dat[i].dtype != 'object':
        num_col.append(i)
X_num = res_dat[num_col]
X = pd.concat([X_cat,X_num],axis = 1)
X = X.dropna(axis = 0)
y = X['diff_rating']
X = X.iloc[:,:-1]
X.drop(['imdb_score','douban_score'],axis = 1,inplace = True)
from sklearn.model_selection import train_test_split
# METHOD 1: BUILD randomforestregressor
X_train,X_val,y_train,y_val = train_test_split(X,y,test_size = 0.1,random_state = 42)

In [15]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 500)
forest = rf.fit(X_train, y_train)
score_r2 = rf.score(X_val,y_val)
# print: R-sqr
print score_r2


0.334270627895

In [16]:
rf_features = sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), X.columns),reverse = True)

In [17]:
import matplotlib.pyplot as plt;
imps,feas = zip(*(rf_features[0:4]+rf_features[6:12]))
ypos = np.arange(len(feas))
plt.barh(ypos,imps,align = 'center',alpha = 0.5)
plt.yticks(ypos,feas)
plt.xlabel('Feature Importance')


Out[17]:
<matplotlib.text.Text at 0x11efeef90>

In [23]:
plt.subplot(1,2,1)
plt.plot(y_train,rf.predict(X_train),'o')
plt.xlabel('Training_y')
plt.ylabel('Predict_y')
plt.xlim(-6,6)
plt.ylim(-6,6)
plt.subplot(1,2,2)
plt.plot(y_val,rf.predict(X_val),'o')
plt.xlabel('val_y')
plt.ylabel('Predict_y')
plt.xlim(-3,4)
plt.ylim(-3,4)


Out[23]:
(-3, 4)

In [18]:
X.columns


Out[18]:
Index([u'top_genre_Action', u'top_genre_Adventure', u'top_genre_Biography',
       u'top_genre_Comedy', u'top_genre_Crime', u'top_genre_Drama',
       u'top_genre_Horror', u'top_genre_Other', u'num_critic_for_reviews',
       u'duration', u'director_facebook_likes', u'actor_3_facebook_likes',
       u'actor_1_facebook_likes', u'gross', u'num_voted_users',
       u'cast_total_facebook_likes', u'facenumber_in_poster',
       u'num_user_for_reviews', u'budget', u'title_year',
       u'actor_2_facebook_likes', u'aspect_ratio', u'movie_facebook_likes',
       u'Unnamed: 0', u'd_year', u'dnum_review'],
      dtype='object')

In [19]:
# Lasso method
from sklearn.linear_model import Lasso
Lassoreg = Lasso(alpha = 1e-4,normalize = True,random_state = 42)
Lassoreg.fit(X,y)
score_r2 = Lassoreg.score(X_val,y_val)
print score_r2
Ls_features = sorted(zip(map(lambda x:round(x,4),Lassoreg.coef_),X.columns))
print Ls_features


0.2254223453
[(-0.2787, 'aspect_ratio'), (-0.1764, 'top_genre_Crime'), (-0.069, 'top_genre_Action'), (-0.0088, 'top_genre_Drama'), (-0.0027, 'duration'), (-0.0024, 'num_critic_for_reviews'), (-0.0001, 'Unnamed: 0'), (-0.0, 'actor_1_facebook_likes'), (0.0, 'actor_2_facebook_likes'), (-0.0, 'actor_3_facebook_likes'), (-0.0, 'budget'), (-0.0, 'cast_total_facebook_likes'), (0.0, 'director_facebook_likes'), (0.0, 'dnum_review'), (0.0, 'facenumber_in_poster'), (0.0, 'gross'), (0.0, 'movie_facebook_likes'), (-0.0, 'num_voted_users'), (0.0, 'top_genre_Adventure'), (0.0, 'top_genre_Biography'), (0.0002, 'num_user_for_reviews'), (0.0047, 'top_genre_Horror'), (0.0057, 'top_genre_Comedy'), (0.0159, 'd_year'), (0.018, 'title_year'), (0.0314, 'top_genre_Other')]

In [20]:
y_val_rf = rf.predict(X_val)
y_val_Ls = Lassoreg.predict(X_val)
y_val_pred = (y_val_rf+y_val_Ls)/2
from sklearn.metrics import r2_score
print r2_score(y_val,y_val_pred)


0.311650990644

In [21]:
import matplotlib.pyplot as plt;
imps,feas = zip(*(Ls_features[0:4]+Ls_features[-4:]))
ypos = np.arange(len(feas))
plt.barh(ypos,imps,align = 'center',alpha = 0.5)
plt.yticks(ypos,feas)
plt.xlabel('Feature Importance (Coefficient)')


Out[21]:
<matplotlib.text.Text at 0x11f4175d0>

In [24]:
plt.subplot(1,2,1)
plt.plot(y_train,Lassoreg.predict(X_train),'o')
plt.xlabel('Training_y')
plt.ylabel('Predict_y')
plt.xlim(-6,6)
plt.ylim(-6,6)
plt.subplot(1,2,2)
plt.plot(y_val,Lassoreg.predict(X_val),'o')
plt.xlabel('val_y')
plt.ylabel('Predict_y')
plt.xlim(-3,4)
plt.ylim(-3,4)


Out[24]:
(-3, 4)

In [ ]: