notebook.community

Edit and run



In [1]:

    
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import json
import statsmodels.api as sm
from statsmodels.formula.api import logit, glm, ols
from random import random
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import math
from sklearn.svm import LinearSVC



In [3]:

    
with open("reviews_df_jason.json","r") as fp:
    df_byreviews = json.load(fp)
df_byreviews=pd.DataFrame(df_byreviews)



In [4]:

    
df_byreviews.head()









    Out[4]:






  
    
      
      adj_gross
      adj_gross_bin
      adj_opening_gross
      adj_opening_gross_bin
      close_date
      close_year
      count
      max
      mean
      min
      movie_id
      n_scorables
      open_date
      open_year
      opening_theaters
      pct_scorables
      rank
      review_id
      review_title
      stars
      text
      theaters
      title
      topic
      valence_avg
      valence_sum
      year
    
  
  
    
      0
       3.011865e+08
       1
       18004075.757142
       1
       0.000000e+00
       1990
       1
       0.921705
       0.921705
       0.921705
       Home Alone
       144
        632448000000
       1990
       1202
       0.701389
         1
           0
                           My favourite Christmas movie!
       10
        it just isn't christmas if i don't watch home...
       2173
                         Home Alone
       1
       0.758622
       76.620792
       1990
    
    
      1
       3.011865e+08
       1
       18004075.757142
       1
       0.000000e+00
       1990
       1
       0.922908
       0.922908
       0.922908
       Home Alone
        65
        632448000000
       1990
       1202
       0.738462
         1
           1
                                 Very funny family movie
        8
        this is a very funny movie for kids and peopl...
       2173
                         Home Alone
       1
       1.408094
       67.588495
       1990
    
    
      10
       8.113545e+06
       0
          61087.601573
       0
       0.000000e+00
       1990
       1
       0.090731
       0.090731
       0.090731
             P.S.
       364
        633052800000
       1990
          2
       0.434066
       109
          10
                               Emasculation Proclamation
        5
        almost  but not quite  benniferidiotically ta...
        240
                  Mr. & Mrs. Bridge
       0
       0.237925
       37.592131
       1990
    
    
      100
       1.209626e+06
       0
         111846.786031
       0
       1.137542e+12
       2006
       1
       0.946521
       0.946521
       0.946521
            Stuck
        87
       1136851200000
       2006
          7
       0.482759
       251
         100
                           Dusting off great LA memoirs!
       10
        john fante wrote a great book called ask the ...
        111
                       Ask the Dust
       0
       0.469998
       19.739933
       2006
    
    
      10000
       1.063991e+05
       0
          20125.559021
       0
       1.137024e+12
       2006
       1
       0.938600
       0.938600
       0.938600
           Sliver
       183
       1137715200000
       2006
          3
       0.677596
       404
       10005
       Touching, inspiring, hits close to home, no ma...
       10
        i knew i would love this movie and the charac...
         10
       The Real Dirt on Farmer John
       1
       0.597825
       74.130280
       2006



In [11]:

    
opening_gross = df_byreviews.groupby("movie_id").adj_opening_gross.mean()
stars = df_byreviews.groupby("movie_id").stars.mean()



In [39]:

    
opening_gross.mean()









    Out[39]:





13540552.123553481



In [40]:

    
plt.hist(opening_gross,bins=10000,alpha=0.5,label="Opening gross")
plt.axvline(x=opening_gross.mean(), label = "Mean", color = "r")

plt.ylim(0, 50)
plt.title("Distribution of movie's opening gross (adjusted for inflation) 1990 - 2015")
plt.xlabel("Opening gross (adjusted for inflation)")
plt.ylabel("Frequency")
plt.legend(loc='upper right')









    Out[40]:





<matplotlib.legend.Legend at 0x1c5172050>



In [66]:

    
year_movies = df_byreviews.groupby("movie_id").year.mean()



In [67]:

    
plt.hist(year_movies, bins=26,alpha=0.5,label="Movies")
# plt.axhline(y=avg_number, label = "Mean", color = "b")

plt.title("Number of movies released from 1990 - 2015")
plt.xlabel("Year")
plt.ylabel("Frequency")
plt.legend(loc='upper right')









    Out[67]:





<matplotlib.legend.Legend at 0x1c81f9350>



In [69]:

    
opening_gross_2000s = df_byreviews[df_byreviews.year >=2000].groupby("movie_id").adj_opening_gross.mean()
opening_gross_1990s = df_byreviews[df_byreviews.year < 2000].groupby("movie_id").adj_opening_gross.mean()



In [81]:

    
plt.hist(opening_gross_2000s,bins=1000, alpha=0.5, color = "r", label="Opening gross for movies post 2000s")
plt.hist(opening_gross_1990s,bins=1000,alpha=0.5, color = "k", label="Opening gross for movies pre 2000s")
# plt.axvline(x=opening_gross.mean(), label = "Mean")
plt.axvline(x=31217400.17, label = "Opening gross for Jaws (1975)")

plt.ylim(0, 100)
plt.title("Distribution of movie's opening gross (adjusted for inflation) 1990 - 2015")
plt.xlabel("Opening gross (adjusted for inflation)")
plt.ylabel("Frequency")
plt.legend(loc='upper right')









    Out[81]:





<matplotlib.legend.Legend at 0x24dbbfe10>



In [82]:

    
plt.hist(opening_gross_2000s,bins=1000, alpha=0.5, color = "r", label="Opening gross for movies post 2000s")
plt.hist(opening_gross_1990s,bins=1000,alpha=0.5, color = "k", label="Opening gross for movies pre 2000s")
# plt.axvline(x=opening_gross.mean(), label = "Mean")
plt.axvline(x=31217400.17, label = "Opening gross for Jaws (1975)")

plt.ylim(0, 100)
plt.title("Distribution of movie's opening gross (adjusted for inflation) 1990 - 2015")
plt.xlabel("Opening gross (adjusted for inflation)")
plt.ylabel("Frequency")
plt.legend(loc='upper right')









    Out[82]:





<matplotlib.legend.Legend at 0x251013c90>



In [84]:

    
opening_gross_00s = df_byreviews[df_byreviews.year >=2000][df_byreviews.year <2010].groupby("movie_id").adj_opening_gross.mean()
opening_gross_10s = df_byreviews[df_byreviews.year >=2010].groupby("movie_id").adj_opening_gross.mean()









    



//anaconda/lib/python2.7/site-packages/pandas/core/frame.py:1712: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  "DataFrame index.", UserWarning)



In [86]:

    
plt.hist(opening_gross_10s,bins=1000, alpha=0.5, color = "g", label="Opening gross for movies post 2010s")
plt.hist(opening_gross_00s,bins=1000, alpha=0.5, color = "r", label="Opening gross for movies post 2000 - 2010")
plt.hist(opening_gross_1990s,bins=1000,alpha=0.5, color = "k", label="Opening gross for movies pre 2000s")
# plt.axvline(x=opening_gross.mean(), label = "Mean")
plt.axvline(x=31217400.17, label = "Opening gross for Jaws (1975)")

plt.ylim(0, 30)
plt.xlim(31217400.17,)
plt.title("Distribution of movie's opening gross (adjusted for inflation) 1990 - 2015")
plt.xlabel("Opening gross (adjusted for inflation)")
plt.ylabel("Frequency")
plt.legend(loc='upper right')









    Out[86]:





<matplotlib.legend.Legend at 0x264490d10>



In [88]:

    
review_count = df_byreviews.groupby("movie_id").review_id.count()



In [90]:

    
review_count.mean()









    Out[90]:





45.980658599484229



In [94]:

    
plt.hist(review_count, bins = 50, alpha = 0.5, color = "b", label="Number of reviews")
plt.axhline(y=review_count.mean(), color = "r", label = "Average number of reviews")

plt.xlim(0,100)
plt.title("Movie reviews")
plt.xlabel("Number of reviews")
plt.ylabel("Frequency")
plt.legend(loc='upper right')









    Out[94]:





<matplotlib.legend.Legend at 0x26ce48290>



In [104]:

    
plt.scatter(df_byreviews.n_scorables, df_byreviews.valence_avg)

plt.title("Valence average vs. number of words within the review that were scored")
plt.xlabel("Number of words scorable")
plt.ylabel("Valence average for review")









    Out[104]:





<matplotlib.text.Text at 0x26d637590>



In [103]:

    
plt.scatter(df_byreviews["mean"], df_byreviews["valence_avg"], color = "r")

plt.title("Sentiment analysis gives more nuance than a valence average.")
plt.xlabel("Probability of positive from sentiment analysis")
plt.ylabel("Valence_Avg from simple sentiment dictionary")









    Out[103]:





<matplotlib.text.Text at 0x26f0b3bd0>



In [17]:

    
plt.scatter(opening_gross, stars)
plt.axvline(x=31217400.17)
# graph_val_avg = plt.hist(dftouse.abs_valence_avg[dftouse.adj_opening_gross_bin ==0],bins=40,alpha=0.5,label="Non-Blockbuster reviews")
# graph_val_avg = plt.axvline(x=dftouse.abs_valence_avg[dftouse.adj_opening_gross_bin ==1].mean(), color='red',alpha=0.9,label="Blockbuster mean")
# graph_val_avg = plt.axvline(x=dftouse.abs_valence_avg[dftouse.adj_opening_gross_bin ==0].mean(), color='blue',alpha=0.5,label="Non-Blockbuster mean")

# graph_val_avg = plt.title("Valence sum for positive and negative reviews")
# graph_val_avg = plt.xlabel("Sum of valence")
# graph_val_avg = plt.ylabel("Frequency")
# graph_val_avg = plt.legend(loc='upper right')









    Out[17]:





<matplotlib.lines.Line2D at 0x1cd57fe10>



In [ ]:

	adj_gross	adj_gross_bin	adj_opening_gross	adj_opening_gross_bin	close_date	close_year	count	max	mean	min	movie_id	n_scorables	open_date	open_year	opening_theaters	pct_scorables	rank	review_id	review_title	stars	text	theaters	title	topic	valence_avg	valence_sum	year
0	3.011865e+08	1	18004075.757142	1	0.000000e+00	1990	1	0.921705	0.921705	0.921705	Home Alone	144	632448000000	1990	1202	0.701389	1	0	My favourite Christmas movie!	10	it just isn't christmas if i don't watch home...	2173	Home Alone	1	0.758622	76.620792	1990
1	3.011865e+08	1	18004075.757142	1	0.000000e+00	1990	1	0.922908	0.922908	0.922908	Home Alone	65	632448000000	1990	1202	0.738462	1	1	Very funny family movie	8	this is a very funny movie for kids and peopl...	2173	Home Alone	1	1.408094	67.588495	1990
10	8.113545e+06	0	61087.601573	0	0.000000e+00	1990	1	0.090731	0.090731	0.090731	P.S.	364	633052800000	1990	2	0.434066	109	10	Emasculation Proclamation	5	almost but not quite benniferidiotically ta...	240	Mr. & Mrs. Bridge	0	0.237925	37.592131	1990
100	1.209626e+06	0	111846.786031	0	1.137542e+12	2006	1	0.946521	0.946521	0.946521	Stuck	87	1136851200000	2006	7	0.482759	251	100	Dusting off great LA memoirs!	10	john fante wrote a great book called ask the ...	111	Ask the Dust	0	0.469998	19.739933	2006
10000	1.063991e+05	0	20125.559021	0	1.137024e+12	2006	1	0.938600	0.938600	0.938600	Sliver	183	1137715200000	2006	3	0.677596	404	10005	Touching, inspiring, hits close to home, no ma...	10	i knew i would love this movie and the charac...	10	The Real Dirt on Farmer John	1	0.597825	74.130280	2006