In [33]:

    
%matplotlib inline
import os
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
params = {'legend.fontsize': 'x-large',
          'figure.figsize': (15, 5),
         'axes.labelsize': 'x-large',
         'axes.titlesize':'x-large',
         'xtick.labelsize':'x-large',
         'ytick.labelsize':'x-large'}
matplotlib.rcParams.update(params)

Data from the "IMDB5000" database



In [2]:

    
imdb_dat = pd.read_csv("movie_metadata.csv")
imdb_dat.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 28 columns):
color                        5024 non-null object
director_name                4939 non-null object
num_critic_for_reviews       4993 non-null float64
duration                     5028 non-null float64
director_facebook_likes      4939 non-null float64
actor_3_facebook_likes       5020 non-null float64
actor_2_name                 5030 non-null object
actor_1_facebook_likes       5036 non-null float64
gross                        4159 non-null float64
genres                       5043 non-null object
actor_1_name                 5036 non-null object
movie_title                  5043 non-null object
num_voted_users              5043 non-null int64
cast_total_facebook_likes    5043 non-null int64
actor_3_name                 5020 non-null object
facenumber_in_poster         5030 non-null float64
plot_keywords                4890 non-null object
movie_imdb_link              5043 non-null object
num_user_for_reviews         5022 non-null float64
language                     5031 non-null object
country                      5038 non-null object
content_rating               4740 non-null object
budget                       4551 non-null float64
title_year                   4935 non-null float64
actor_2_facebook_likes       5030 non-null float64
imdb_score                   5043 non-null float64
aspect_ratio                 4714 non-null float64
movie_facebook_likes         5043 non-null int64
dtypes: float64(13), int64(3), object(12)
memory usage: 1.1+ MB

Scrape data from DOUBAN.COM



In [3]:

    
import requests
import re
from bs4 import BeautifulSoup
import time
import string
# return the douban movie rating that matches the movie name and year
# read in the movie name
def doubanRating(name):
    movie_name = name.decode('gbk').encode('utf-8')
    url_head = 'http://movie.douban.com/subject_search'
    pageload = {'search_text': movie_name}
    r = requests.get(url_head,params = pageload)
    soup = BeautifulSoup(r.text,'html.parser')
    first_hit = soup.find_all(class_= 'nbg')
    try:
        r2_link = first_hit[0].get('href')
        # sometime douban returns items like celebrity instead of movies    
        if 'subject' not in r2_link:
            r2_link = first_hit[1].get('href')
        r2 = requests.get(r2_link)
        soup2 = BeautifulSoup(r2.text,'html.parser')
        title = soup2.find(property = "v:itemreviewed")
        title = title.get_text() # in unicode
        # remove Chinese characters
        title = ' '.join((title.split(' '))[1:])
        title = filter(lambda x:x in set(string.printable),title)
        flag = True
        if title != name:
            print "Warning: name may not match"
            flag = False
        year = (soup2.find(class_='year')).get_text()# in unicode
        rating = (soup2.find(class_="ll rating_num")).get_text() # in unicode
        num_review = (soup2.find(property="v:votes")).get_text()
        return [title, year, rating,num_review,flag]
    except:
        print "Record not found for: "+name
        return [name, None, None, None, None]



In [4]:

    
#%%2. Store scrapped data    
dataset = pd.read_csv("movie_metadata.csv")
total_length = 5043
#first_query = 2500
res = pd.DataFrame(columns = ('movie_title','year','rating','num_review','flag'))
for i in xrange(1,total_length):
    name = dataset['movie_title'][i].strip().strip('\xc2\xa0')
    res.loc[i] = doubanRating(name)
    print "slowly and finally done %d query"%i
    time.sleep(10)
    if (i%50==0):
        res.to_csv("douban_movie_review.csv")
        print "saved until record: %d"%i









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-4-1cd75b30270a> in <module>()
      6 for i in xrange(1,total_length):
      7     name = dataset['movie_title'][i].strip().strip('\xc2\xa0')
----> 8     res.loc[i] = doubanRating(name)
      9     print "slowly and finally done %d query"%i
     10     time.sleep(10)

NameError: name 'doubanRating' is not defined

1. Preliminary data visualization and analysis



In [3]:

    
douban_dat = pd.read_csv("douban_movie_review.csv")
douban_dat.rename(columns = {'movie_title':'d_movie_title','year':'d_year','rating':'douban_score','num_review':'dnum_review','flag':'dflag'},inplace = True)
douban_dat.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 6 columns):
Unnamed: 0       5043 non-null int64
d_movie_title    4975 non-null object
d_year           4447 non-null float64
douban_score     4447 non-null float64
dnum_review      4447 non-null float64
dflag            4447 non-null object
dtypes: float64(3), int64(1), object(2)
memory usage: 236.5+ KB



In [4]:

    
res_dat = pd.concat([imdb_dat,douban_dat],axis = 1)
res_dat.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 34 columns):
color                        5024 non-null object
director_name                4939 non-null object
num_critic_for_reviews       4993 non-null float64
duration                     5028 non-null float64
director_facebook_likes      4939 non-null float64
actor_3_facebook_likes       5020 non-null float64
actor_2_name                 5030 non-null object
actor_1_facebook_likes       5036 non-null float64
gross                        4159 non-null float64
genres                       5043 non-null object
actor_1_name                 5036 non-null object
movie_title                  5043 non-null object
num_voted_users              5043 non-null int64
cast_total_facebook_likes    5043 non-null int64
actor_3_name                 5020 non-null object
facenumber_in_poster         5030 non-null float64
plot_keywords                4890 non-null object
movie_imdb_link              5043 non-null object
num_user_for_reviews         5022 non-null float64
language                     5031 non-null object
country                      5038 non-null object
content_rating               4740 non-null object
budget                       4551 non-null float64
title_year                   4935 non-null float64
actor_2_facebook_likes       5030 non-null float64
imdb_score                   5043 non-null float64
aspect_ratio                 4714 non-null float64
movie_facebook_likes         5043 non-null int64
Unnamed: 0                   5043 non-null int64
d_movie_title                4975 non-null object
d_year                       4447 non-null float64
douban_score                 4447 non-null float64
dnum_review                  4447 non-null float64
dflag                        4447 non-null object
dtypes: float64(16), int64(4), object(14)
memory usage: 1.3+ MB

1.1 Visulze the gross distribution of rating from IMDB (x-axis) and Douban (y-axis)



In [34]:

    
# 1. visulize the gross distribution of ratings from imdb(x-axis) and douban(y-axis)
import seaborn as sns
g = sns.jointplot(x = 'imdb_score',y = 'douban_score',data = res_dat)
g.ax_joint.set(xlim=(1, 10), ylim=(1, 10))









    Out[34]:





[(1, 10), (1, 10)]

1.2 Is it necessary to recenter(scale) IMDB score and Douban score?



In [35]:

    
# plot distribution and bar graphs(significantly different)
from scipy import stats
nbins = 15
fig,axes = plt.subplots(nrows = 1,ncols = 2, figsize = (10,8))
ax0,ax1 = axes.flatten()
ax0.hist([res_dat.douban_score,res_dat.imdb_score],nbins, histtype = 'bar',label = ["Douban","IMDB"])
ax0.set_title('The distribution of movie ratings')
ax0.set_xlabel('Rating')
ax0.set_ylabel('Count')
ax0.legend()
imdb_score = np.mean(res_dat.imdb_score)
douban_score = np.mean(res_dat.douban_score)
ax1.bar([0,1],[imdb_score, douban_score], yerr = [np.std(res_dat.imdb_score),np.std(res_dat.douban_score)], 
        align = 'center',color = ['green','blue'], ecolor = 'black')
ax1.set_xticks([0,1])
ax1.set_xticklabels(['IMDB','Douban'])
ax1.set_ylabel('Score')
_,p = stats.ttest_rel(res_dat['imdb_score'], res_dat['douban_score'],nan_policy = 'omit')
ax1.set_title('A comparison of ratings\n'+'t-test: p = %.4f***'%p)
#fig.tight_layout()
plt.show()
# any significant differences

1.3 Normalize IMDB and Douban rating scores



In [81]:

    
from sklearn import preprocessing
data = res_dat.dropna()
print " delete null values, the remaining data is",data.shape









    



 delete null values, the remaining data is (3468, 34)



In [82]:

    
data.loc[:,'scaled_imdb'] = preprocessing.scale(data['imdb_score']) 
data.loc[:,'scaled_douban'] = preprocessing.scale(data['douban_score'])



In [85]:

    
#stats.ttest_rel(data['scaled_imdb'], data['scaled_douban'],nan_policy = 'omit')
from scipy.stats import norm, lognorm
import matplotlib.mlab as mlab
fig,axes = plt.subplots(nrows = 1,ncols = 2, figsize = (10,8))
ax0,ax1 = axes.flatten()
ax0.plot(data['scaled_imdb'],data['scaled_douban'],'ro')
ax0.set_title('Normalized Scores')
ax0.set_xlabel('Scaled IMDB score')
ax0.set_ylabel('Scaled Douban score')
data.loc[:,'rating_diff'] = data['scaled_imdb'] - data['scaled_douban']
(mu,sigma) = norm.fit(data['rating_diff'])
_,bins,_ = ax1.hist(data['rating_diff'],60,normed = 1, histtype = 'bar',alpha = 0.75)
ax1.plot(bins, mlab.normpdf(bins,mu,sigma),'r--',linewidth = 2)
ax1.set_xlabel('IMDB_score - Douban_score')
ax1.set_ylabel('percentage')
ax1.set_title('Rating difference Distribution')
fig.tight_layout()
plt.show()

1.4 Visulze Features



In [87]:

    
data.describe()









    Out[87]:






  
    
      
      num_critic_for_reviews
      duration
      director_facebook_likes
      actor_3_facebook_likes
      actor_1_facebook_likes
      gross
      num_voted_users
      cast_total_facebook_likes
      facenumber_in_poster
      num_user_for_reviews
      ...
      imdb_score
      aspect_ratio
      movie_facebook_likes
      Unnamed: 0
      d_year
      douban_score
      dnum_review
      scaled_imdb
      scaled_douban
      rating_diff
    
  
  
    
      count
      3468.000000
      3468.000000
      3468.000000
      3468.000000
      3468.000000
      3.468000e+03
      3.468000e+03
      3468.000000
      3468.000000
      3468.000000
      ...
      3468.000000
      3468.000000
      3468.000000
      3468.000000
      3468.000000
      3468.000000
      3468.000000
      3.468000e+03
      3.468000e+03
      3.468000e+03
    
    
      mean
      172.115629
      110.626009
      833.070934
      790.064014
      8068.584487
      5.501582e+07
      1.104752e+05
      11969.002884
      1.369666
      347.628893
      ...
      6.487716
      2.114175
      9749.908304
      2074.490484
      -2003.390138
      7.076701
      32749.026817
      -1.243839e-16
      -2.091752e-16
      1.836926e-16
    
    
      std
      123.124784
      22.718718
      3131.479201
      1927.206403
      15960.699190
      7.171113e+07
      1.552261e+05
      19597.847902
      2.000439
      417.836395
      ...
      1.047347
      0.270469
      21785.161332
      1329.638038
      10.442049
      0.928766
      65573.107152
      1.000144e+00
      1.000144e+00
      7.798167e-01
    
    
      min
      2.000000
      37.000000
      0.000000
      0.000000
      0.000000
      1.620000e+02
      5.750000e+02
      0.000000
      0.000000
      5.000000
      ...
      1.600000
      1.180000
      0.000000
      0.000000
      -2017.000000
      3.000000
      27.000000
      -4.667432e+00
      -4.390006e+00
      -4.889640e+00
    
    
      25%
      82.000000
      96.000000
      11.000000
      208.750000
      780.000000
      1.021051e+07
      2.245500e+04
      2016.000000
      0.000000
      117.750000
      ...
      5.900000
      1.850000
      0.000000
      933.750000
      -2010.000000
      6.400000
      1231.750000
      -5.612285e-01
      -7.287074e-01
      -3.973756e-01
    
    
      50%
      143.000000
      106.000000
      66.000000
      447.000000
      2000.000000
      3.204881e+07
      5.731250e+04
      4319.500000
      1.000000
      216.000000
      ...
      6.600000
      2.350000
      258.000000
      1943.500000
      -2005.000000
      7.100000
      6790.500000
      1.072232e-01
      2.508930e-02
      4.760155e-02
    
    
      75%
      227.000000
      120.000000
      248.000000
      701.000000
      13000.000000
      7.017879e+07
      1.346860e+05
      16729.250000
      2.000000
      411.250000
      ...
      7.200000
      2.350000
      12000.000000
      3080.250000
      -1999.000000
      7.700000
      30995.750000
      6.801818e-01
      6.712008e-01
      4.646173e-01
    
    
      max
      813.000000
      330.000000
      23000.000000
      23000.000000
      640000.000000
      7.605058e+08
      1.689764e+06
      656730.000000
      43.000000
      5060.000000
      ...
      9.300000
      2.760000
      349000.000000
      5042.000000
      -1921.000000
      9.600000
      781648.000000
      2.685537e+00
      2.717220e+00
      4.090872e+00
    
  

8 rows × 23 columns



In [88]:

    
data.describe(include = ['object'])









    Out[88]:






  
    
      
      color
      director_name
      actor_2_name
      genres
      actor_1_name
      movie_title
      actor_3_name
      plot_keywords
      movie_imdb_link
      language
      country
      content_rating
      d_movie_title
      dflag
    
  
  
    
      count
      3468
      3468
      3468
      3468
      3468
      3468
      3468
      3468
      3468
      3468
      3468
      3468
      3468
      3468
    
    
      unique
      2
      1533
      2028
      709
      1310
      3371
      2403
      3371
      3371
      31
      43
      12
      3294
      3
    
    
      top
      Color
      Steven Spielberg
      Morgan Freeman
      Comedy|Drama|Romance
      Robert De Niro
      Home
      Steve Coogan
      eighteen wheeler|illegal street racing|truck|t...
      http://www.imdb.com/title/tt1976009/?ref_=fn_t...
      English
      USA
      R
      Mad Max: Fury Road
      TRUE
    
    
      freq
      3360
      25
      20
      133
      42
      3
      8
      3
      3
      3350
      2786
      1544
      5
      3080



In [98]:

    
ind = data['rating_diff'].argmax()
print data.iloc[ind].movie_title
print data.iloc[ind].scaled_imdb
print data.iloc[ind].scaled_douban
print data.iloc[ind].title_year
print data.iloc[ind].movie_imdb_link
print data.iloc[ind].d_year
print data.iloc[ind].douban_score
print data.iloc[ind].imdb_score









    



The Scorch Trials 
-0.0837629805358
-1.48250415851
2015.0
http://www.imdb.com/title/tt4046784/?ref_=fn_tt_tt_1
-2015.0
5.7



In [93]:

    
data.columns









    Out[93]:





Index([u'color', u'director_name', u'num_critic_for_reviews', u'duration',
       u'director_facebook_likes', u'actor_3_facebook_likes', u'actor_2_name',
       u'actor_1_facebook_likes', u'gross', u'genres', u'actor_1_name',
       u'movie_title', u'num_voted_users', u'cast_total_facebook_likes',
       u'actor_3_name', u'facenumber_in_poster', u'plot_keywords',
       u'movie_imdb_link', u'num_user_for_reviews', u'language', u'country',
       u'content_rating', u'budget', u'title_year', u'actor_2_facebook_likes',
       u'imdb_score', u'aspect_ratio', u'movie_facebook_likes', u'Unnamed: 0',
       u'd_movie_title', u'd_year', u'douban_score', u'dnum_review', u'dflag',
       u'scaled_imdb', u'scaled_douban', u'rating_diff'],
      dtype='object')



In [8]:

    
# 2. Predict differences in ratings
res_dat['diff_rating'] = res_dat['douban_score']-res_dat['imdb_score'] 
# 2.1. covert categorical variable Genre to Dummy variables
# only extract the first genre out of the list to simplify the problem
res_dat['genre1'] = res_dat.apply(lambda row:(row['genres'].split('|'))[0],axis = 1)
#res_dat['genre1'].value_counts()
# Because there are 21 genres, here we only choose the top 7 to convert to index
top_genre = ['Comedy','Action','Drama','Adventure','Crime','Biography','Horror']
# The rest of genre types we just consider them as others
res_dat['top_genre'] = res_dat.apply(lambda row:row['genre1'] if row['genre1'] in top_genre else 'Other',axis =1)
#select num_user_for_reviews ,director_facebook_likes ,actor_1_facebook_likes  ,gross , genres,
#budget,# dnum_review # for EDA
res_subdat = res_dat[['top_genre','num_user_for_reviews','director_facebook_likes','actor_1_facebook_likes','gross','budget','dnum_review','diff_rating']]
res_subdat = pd.get_dummies(res_subdat,prefix =['top_genre'])
#res_dat = pd.get_dummies(res_dat,prefix = ['top_genre'])
res_subdat.shape









    Out[8]:





(5043, 15)



In [9]:

    
# create a subset for visualization and preliminary analysis
col2 = [u'num_user_for_reviews', u'director_facebook_likes',
       u'actor_1_facebook_likes', u'gross', u'budget', u'dnum_review', u'top_genre_Action', u'top_genre_Adventure',
       u'top_genre_Biography', u'top_genre_Comedy', u'top_genre_Crime',
       u'top_genre_Drama', u'top_genre_Horror', u'top_genre_Other',u'diff_rating']
res_subdat = res_subdat[col2]



In [10]:

    
# a subset for plotting correlation
col_cat = [u'gross', u'budget', u'dnum_review',u'num_user_for_reviews',u'top_genre_Action', u'top_genre_Adventure',
       u'top_genre_Biography', u'top_genre_Comedy', u'top_genre_Crime',
       u'top_genre_Drama', u'top_genre_Horror', u'diff_rating']
res_subdat_genre = res_subdat[col_cat]



In [11]:

    
# show pair-wise correlation between differences in ratings and estimators
import matplotlib.pylab as plt
import numpy as np
corr = res_subdat_genre.corr()
sns.set(style = "white")
f,ax = plt.subplots(figsize=(11,9))
cmap = sns.diverging_palette(220,10,as_cmap=True)
mask = np.zeros_like(corr,dtype = np.bool)
sns.heatmap(corr,mask = mask,cmap = cmap, vmax=.3,square = True, linewidths = .5,
            cbar_kws = {"shrink": .5},ax = ax)









    Out[11]:





<matplotlib.axes._subplots.AxesSubplot at 0x10cc2e050>



In [12]:

    
# prepare trainning set and target set
col_train = col2[:len(col2)-1]
col_target = col2[len(col2)-1]
#cl_res_subdat = res_subdat.dropna(axis =0)



In [13]:

    
cl_res_subdat.shaperating_diff









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-13-6e97cd50e0d0> in <module>()
----> 1 cl_res_subdat.shape

NameError: name 'cl_res_subdat' is not defined



In [14]:

    
# 2.2 Use Random Forest Regressor for prediction
X_cat = res_subdat.ix[:,'top_genre_Action':'top_genre_Other']
num_col = []
for i in res_dat.columns:
    if res_dat[i].dtype != 'object':
        num_col.append(i)
X_num = res_dat[num_col]
X = pd.concat([X_cat,X_num],axis = 1)
X = X.dropna(axis = 0)
y = X['diff_rating']
X = X.iloc[:,:-1]
X.drop(['imdb_score','douban_score'],axis = 1,inplace = True)
from sklearn.model_selection import train_test_split
# METHOD 1: BUILD randomforestregressor
X_train,X_val,y_train,y_val = train_test_split(X,y,test_size = 0.1,random_state = 42)



In [15]:

    
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators = 500)
forest = rf.fit(X_train, y_train)
score_r2 = rf.score(X_val,y_val)
# print: R-sqr
print score_r2









    



0.334270627895



In [16]:

    
rf_features = sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), X.columns),reverse = True)



In [17]:

    
import matplotlib.pyplot as plt;
imps,feas = zip(*(rf_features[0:4]+rf_features[6:12]))
ypos = np.arange(len(feas))
plt.barh(ypos,imps,align = 'center',alpha = 0.5)
plt.yticks(ypos,feas)
plt.xlabel('Feature Importance')









    Out[17]:





<matplotlib.text.Text at 0x11efeef90>



In [23]:

    
plt.subplot(1,2,1)
plt.plot(y_train,rf.predict(X_train),'o')
plt.xlabel('Training_y')
plt.ylabel('Predict_y')
plt.xlim(-6,6)
plt.ylim(-6,6)
plt.subplot(1,2,2)
plt.plot(y_val,rf.predict(X_val),'o')
plt.xlabel('val_y')
plt.ylabel('Predict_y')
plt.xlim(-3,4)
plt.ylim(-3,4)









    Out[23]:





(-3, 4)



In [18]:

    
X.columns









    Out[18]:





Index([u'top_genre_Action', u'top_genre_Adventure', u'top_genre_Biography',
       u'top_genre_Comedy', u'top_genre_Crime', u'top_genre_Drama',
       u'top_genre_Horror', u'top_genre_Other', u'num_critic_for_reviews',
       u'duration', u'director_facebook_likes', u'actor_3_facebook_likes',
       u'actor_1_facebook_likes', u'gross', u'num_voted_users',
       u'cast_total_facebook_likes', u'facenumber_in_poster',
       u'num_user_for_reviews', u'budget', u'title_year',
       u'actor_2_facebook_likes', u'aspect_ratio', u'movie_facebook_likes',
       u'Unnamed: 0', u'd_year', u'dnum_review'],
      dtype='object')



In [19]:

    
# Lasso method
from sklearn.linear_model import Lasso
Lassoreg = Lasso(alpha = 1e-4,normalize = True,random_state = 42)
Lassoreg.fit(X,y)
score_r2 = Lassoreg.score(X_val,y_val)
print score_r2
Ls_features = sorted(zip(map(lambda x:round(x,4),Lassoreg.coef_),X.columns))
print Ls_features









    



0.2254223453
[(-0.2787, 'aspect_ratio'), (-0.1764, 'top_genre_Crime'), (-0.069, 'top_genre_Action'), (-0.0088, 'top_genre_Drama'), (-0.0027, 'duration'), (-0.0024, 'num_critic_for_reviews'), (-0.0001, 'Unnamed: 0'), (-0.0, 'actor_1_facebook_likes'), (0.0, 'actor_2_facebook_likes'), (-0.0, 'actor_3_facebook_likes'), (-0.0, 'budget'), (-0.0, 'cast_total_facebook_likes'), (0.0, 'director_facebook_likes'), (0.0, 'dnum_review'), (0.0, 'facenumber_in_poster'), (0.0, 'gross'), (0.0, 'movie_facebook_likes'), (-0.0, 'num_voted_users'), (0.0, 'top_genre_Adventure'), (0.0, 'top_genre_Biography'), (0.0002, 'num_user_for_reviews'), (0.0047, 'top_genre_Horror'), (0.0057, 'top_genre_Comedy'), (0.0159, 'd_year'), (0.018, 'title_year'), (0.0314, 'top_genre_Other')]



In [20]:

    
y_val_rf = rf.predict(X_val)
y_val_Ls = Lassoreg.predict(X_val)
y_val_pred = (y_val_rf+y_val_Ls)/2
from sklearn.metrics import r2_score
print r2_score(y_val,y_val_pred)









    



0.311650990644



In [21]:

    
import matplotlib.pyplot as plt;
imps,feas = zip(*(Ls_features[0:4]+Ls_features[-4:]))
ypos = np.arange(len(feas))
plt.barh(ypos,imps,align = 'center',alpha = 0.5)
plt.yticks(ypos,feas)
plt.xlabel('Feature Importance (Coefficient)')









    Out[21]:





<matplotlib.text.Text at 0x11f4175d0>



In [24]:

    
plt.subplot(1,2,1)
plt.plot(y_train,Lassoreg.predict(X_train),'o')
plt.xlabel('Training_y')
plt.ylabel('Predict_y')
plt.xlim(-6,6)
plt.ylim(-6,6)
plt.subplot(1,2,2)
plt.plot(y_val,Lassoreg.predict(X_val),'o')
plt.xlabel('val_y')
plt.ylabel('Predict_y')
plt.xlim(-3,4)
plt.ylim(-3,4)









    Out[24]:





(-3, 4)



In [ ]:

	num_critic_for_reviews	duration	director_facebook_likes	actor_3_facebook_likes	actor_1_facebook_likes	gross	num_voted_users	cast_total_facebook_likes	facenumber_in_poster	num_user_for_reviews	...	imdb_score	aspect_ratio	movie_facebook_likes	Unnamed: 0	d_year	douban_score	dnum_review	scaled_imdb	scaled_douban	rating_diff
count	3468.000000	3468.000000	3468.000000	3468.000000	3468.000000	3.468000e+03	3.468000e+03	3468.000000	3468.000000	3468.000000	...	3468.000000	3468.000000	3468.000000	3468.000000	3468.000000	3468.000000	3468.000000	3.468000e+03	3.468000e+03	3.468000e+03
mean	172.115629	110.626009	833.070934	790.064014	8068.584487	5.501582e+07	1.104752e+05	11969.002884	1.369666	347.628893	...	6.487716	2.114175	9749.908304	2074.490484	-2003.390138	7.076701	32749.026817	-1.243839e-16	-2.091752e-16	1.836926e-16
std	123.124784	22.718718	3131.479201	1927.206403	15960.699190	7.171113e+07	1.552261e+05	19597.847902	2.000439	417.836395	...	1.047347	0.270469	21785.161332	1329.638038	10.442049	0.928766	65573.107152	1.000144e+00	1.000144e+00	7.798167e-01
min	2.000000	37.000000	0.000000	0.000000	0.000000	1.620000e+02	5.750000e+02	0.000000	0.000000	5.000000	...	1.600000	1.180000	0.000000	0.000000	-2017.000000	3.000000	27.000000	-4.667432e+00	-4.390006e+00	-4.889640e+00
25%	82.000000	96.000000	11.000000	208.750000	780.000000	1.021051e+07	2.245500e+04	2016.000000	0.000000	117.750000	...	5.900000	1.850000	0.000000	933.750000	-2010.000000	6.400000	1231.750000	-5.612285e-01	-7.287074e-01	-3.973756e-01
50%	143.000000	106.000000	66.000000	447.000000	2000.000000	3.204881e+07	5.731250e+04	4319.500000	1.000000	216.000000	...	6.600000	2.350000	258.000000	1943.500000	-2005.000000	7.100000	6790.500000	1.072232e-01	2.508930e-02	4.760155e-02
75%	227.000000	120.000000	248.000000	701.000000	13000.000000	7.017879e+07	1.346860e+05	16729.250000	2.000000	411.250000	...	7.200000	2.350000	12000.000000	3080.250000	-1999.000000	7.700000	30995.750000	6.801818e-01	6.712008e-01	4.646173e-01
max	813.000000	330.000000	23000.000000	23000.000000	640000.000000	7.605058e+08	1.689764e+06	656730.000000	43.000000	5060.000000	...	9.300000	2.760000	349000.000000	5042.000000	-1921.000000	9.600000	781648.000000	2.685537e+00	2.717220e+00	4.090872e+00

	color	director_name	actor_2_name	genres	actor_1_name	movie_title	actor_3_name	plot_keywords	movie_imdb_link	language	country	content_rating	d_movie_title	dflag
count	3468	3468	3468	3468	3468	3468	3468	3468	3468	3468	3468	3468	3468	3468
unique	2	1533	2028	709	1310	3371	2403	3371	3371	31	43	12	3294	3
top	Color	Steven Spielberg	Morgan Freeman	Comedy\|Drama\|Romance	Robert De Niro	Home	Steve Coogan	eighteen wheeler\|illegal street racing\|truck\|t...	http://www.imdb.com/title/tt1976009/?ref_=fn_t...	English	USA	R	Mad Max: Fury Road	TRUE
freq	3360	25	20	133	42	3	8	3	3	3350	2786	1544	5	3080