In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import json
import statsmodels.api as sm
from statsmodels.formula.api import logit, glm, ols
from random import random
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import math
from sklearn.svm import LinearSVC

In [3]:
with open("reviews_df_jason.json","r") as fp:
    df_byreviews = json.load(fp)
df_byreviews=pd.DataFrame(df_byreviews)

In [4]:
df_byreviews.head()


Out[4]:
adj_gross adj_gross_bin adj_opening_gross adj_opening_gross_bin close_date close_year count max mean min movie_id n_scorables open_date open_year opening_theaters pct_scorables rank review_id review_title stars text theaters title topic valence_avg valence_sum year
0 3.011865e+08 1 18004075.757142 1 0.000000e+00 1990 1 0.921705 0.921705 0.921705 Home Alone 144 632448000000 1990 1202 0.701389 1 0 My favourite Christmas movie! 10 it just isn't christmas if i don't watch home... 2173 Home Alone 1 0.758622 76.620792 1990
1 3.011865e+08 1 18004075.757142 1 0.000000e+00 1990 1 0.922908 0.922908 0.922908 Home Alone 65 632448000000 1990 1202 0.738462 1 1 Very funny family movie 8 this is a very funny movie for kids and peopl... 2173 Home Alone 1 1.408094 67.588495 1990
10 8.113545e+06 0 61087.601573 0 0.000000e+00 1990 1 0.090731 0.090731 0.090731 P.S. 364 633052800000 1990 2 0.434066 109 10 Emasculation Proclamation 5 almost but not quite benniferidiotically ta... 240 Mr. & Mrs. Bridge 0 0.237925 37.592131 1990
100 1.209626e+06 0 111846.786031 0 1.137542e+12 2006 1 0.946521 0.946521 0.946521 Stuck 87 1136851200000 2006 7 0.482759 251 100 Dusting off great LA memoirs! 10 john fante wrote a great book called ask the ... 111 Ask the Dust 0 0.469998 19.739933 2006
10000 1.063991e+05 0 20125.559021 0 1.137024e+12 2006 1 0.938600 0.938600 0.938600 Sliver 183 1137715200000 2006 3 0.677596 404 10005 Touching, inspiring, hits close to home, no ma... 10 i knew i would love this movie and the charac... 10 The Real Dirt on Farmer John 1 0.597825 74.130280 2006

In [11]:
opening_gross = df_byreviews.groupby("movie_id").adj_opening_gross.mean()
stars = df_byreviews.groupby("movie_id").stars.mean()

In [39]:
opening_gross.mean()


Out[39]:
13540552.123553481

In [40]:
plt.hist(opening_gross,bins=10000,alpha=0.5,label="Opening gross")
plt.axvline(x=opening_gross.mean(), label = "Mean", color = "r")

plt.ylim(0, 50)
plt.title("Distribution of movie's opening gross (adjusted for inflation) 1990 - 2015")
plt.xlabel("Opening gross (adjusted for inflation)")
plt.ylabel("Frequency")
plt.legend(loc='upper right')


Out[40]:
<matplotlib.legend.Legend at 0x1c5172050>

In [66]:
year_movies = df_byreviews.groupby("movie_id").year.mean()

In [67]:
plt.hist(year_movies, bins=26,alpha=0.5,label="Movies")
# plt.axhline(y=avg_number, label = "Mean", color = "b")

plt.title("Number of movies released from 1990 - 2015")
plt.xlabel("Year")
plt.ylabel("Frequency")
plt.legend(loc='upper right')


Out[67]:
<matplotlib.legend.Legend at 0x1c81f9350>

In [69]:
opening_gross_2000s = df_byreviews[df_byreviews.year >=2000].groupby("movie_id").adj_opening_gross.mean()
opening_gross_1990s = df_byreviews[df_byreviews.year < 2000].groupby("movie_id").adj_opening_gross.mean()

In [81]:
plt.hist(opening_gross_2000s,bins=1000, alpha=0.5, color = "r", label="Opening gross for movies post 2000s")
plt.hist(opening_gross_1990s,bins=1000,alpha=0.5, color = "k", label="Opening gross for movies pre 2000s")
# plt.axvline(x=opening_gross.mean(), label = "Mean")
plt.axvline(x=31217400.17, label = "Opening gross for Jaws (1975)")

plt.ylim(0, 100)
plt.title("Distribution of movie's opening gross (adjusted for inflation) 1990 - 2015")
plt.xlabel("Opening gross (adjusted for inflation)")
plt.ylabel("Frequency")
plt.legend(loc='upper right')


Out[81]:
<matplotlib.legend.Legend at 0x24dbbfe10>

In [82]:
plt.hist(opening_gross_2000s,bins=1000, alpha=0.5, color = "r", label="Opening gross for movies post 2000s")
plt.hist(opening_gross_1990s,bins=1000,alpha=0.5, color = "k", label="Opening gross for movies pre 2000s")
# plt.axvline(x=opening_gross.mean(), label = "Mean")
plt.axvline(x=31217400.17, label = "Opening gross for Jaws (1975)")

plt.ylim(0, 100)
plt.title("Distribution of movie's opening gross (adjusted for inflation) 1990 - 2015")
plt.xlabel("Opening gross (adjusted for inflation)")
plt.ylabel("Frequency")
plt.legend(loc='upper right')


Out[82]:
<matplotlib.legend.Legend at 0x251013c90>

In [84]:
opening_gross_00s = df_byreviews[df_byreviews.year >=2000][df_byreviews.year <2010].groupby("movie_id").adj_opening_gross.mean()
opening_gross_10s = df_byreviews[df_byreviews.year >=2010].groupby("movie_id").adj_opening_gross.mean()


//anaconda/lib/python2.7/site-packages/pandas/core/frame.py:1712: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  "DataFrame index.", UserWarning)

In [86]:
plt.hist(opening_gross_10s,bins=1000, alpha=0.5, color = "g", label="Opening gross for movies post 2010s")
plt.hist(opening_gross_00s,bins=1000, alpha=0.5, color = "r", label="Opening gross for movies post 2000 - 2010")
plt.hist(opening_gross_1990s,bins=1000,alpha=0.5, color = "k", label="Opening gross for movies pre 2000s")
# plt.axvline(x=opening_gross.mean(), label = "Mean")
plt.axvline(x=31217400.17, label = "Opening gross for Jaws (1975)")

plt.ylim(0, 30)
plt.xlim(31217400.17,)
plt.title("Distribution of movie's opening gross (adjusted for inflation) 1990 - 2015")
plt.xlabel("Opening gross (adjusted for inflation)")
plt.ylabel("Frequency")
plt.legend(loc='upper right')


Out[86]:
<matplotlib.legend.Legend at 0x264490d10>

In [88]:
review_count = df_byreviews.groupby("movie_id").review_id.count()

In [90]:
review_count.mean()


Out[90]:
45.980658599484229

In [94]:
plt.hist(review_count, bins = 50, alpha = 0.5, color = "b", label="Number of reviews")
plt.axhline(y=review_count.mean(), color = "r", label = "Average number of reviews")

plt.xlim(0,100)
plt.title("Movie reviews")
plt.xlabel("Number of reviews")
plt.ylabel("Frequency")
plt.legend(loc='upper right')


Out[94]:
<matplotlib.legend.Legend at 0x26ce48290>

In [104]:
plt.scatter(df_byreviews.n_scorables, df_byreviews.valence_avg)

plt.title("Valence average vs. number of words within the review that were scored")
plt.xlabel("Number of words scorable")
plt.ylabel("Valence average for review")


Out[104]:
<matplotlib.text.Text at 0x26d637590>

In [103]:
plt.scatter(df_byreviews["mean"], df_byreviews["valence_avg"], color = "r")

plt.title("Sentiment analysis gives more nuance than a valence average.")
plt.xlabel("Probability of positive from sentiment analysis")
plt.ylabel("Valence_Avg from simple sentiment dictionary")


Out[103]:
<matplotlib.text.Text at 0x26f0b3bd0>

In [17]:
plt.scatter(opening_gross, stars)
plt.axvline(x=31217400.17)
# graph_val_avg = plt.hist(dftouse.abs_valence_avg[dftouse.adj_opening_gross_bin ==0],bins=40,alpha=0.5,label="Non-Blockbuster reviews")
# graph_val_avg = plt.axvline(x=dftouse.abs_valence_avg[dftouse.adj_opening_gross_bin ==1].mean(), color='red',alpha=0.9,label="Blockbuster mean")
# graph_val_avg = plt.axvline(x=dftouse.abs_valence_avg[dftouse.adj_opening_gross_bin ==0].mean(), color='blue',alpha=0.5,label="Non-Blockbuster mean")

# graph_val_avg = plt.title("Valence sum for positive and negative reviews")
# graph_val_avg = plt.xlabel("Sum of valence")
# graph_val_avg = plt.ylabel("Frequency")
# graph_val_avg = plt.legend(loc='upper right')


Out[17]:
<matplotlib.lines.Line2D at 0x1cd57fe10>

In [ ]: