In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
import json
import statsmodels.api as sm
from statsmodels.formula.api import logit, glm, ols
from random import random
# pandas display options: allow up to 500 characters of width and up to 100
# columns when a frame is shown.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
# Use the HTML table representation for DataFrames in the notebook.
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# Global seaborn theme for every figure below: white grid background and the
# "poster" scaling context.
sns.set_style("whitegrid")
sns.set_context("poster")
import math
from sklearn.svm import LinearSVC
In [3]:
# Load the review-level records from the JSON dump and build the DataFrame
# used by every cell below.
# The parsed JSON is kept under its own name so that `df_byreviews` is only
# ever bound to a DataFrame, and the file is closed as soon as parsing is done.
with open("reviews_df_jason.json", "r") as fp:
    reviews_records = json.load(fp)
df_byreviews = pd.DataFrame(reviews_records)
In [4]:
# Preview the first five rows of the review-level frame.
df_byreviews.head(n=5)
Out[4]:
In [11]:
# Collapse the review-level rows to one value per movie_id: the mean of the
# adjusted opening gross and the mean of the star rating.
reviews_by_movie = df_byreviews.groupby("movie_id")
opening_gross = reviews_by_movie["adj_opening_gross"].mean()
stars = reviews_by_movie["stars"].mean()
In [39]:
# Average over movies of the per-movie mean adjusted opening gross.
opening_gross.mean()
Out[39]:
In [40]:
# Histogram of the per-movie opening gross, with the overall mean marked by a
# red vertical line. The y-axis is capped at 50.
ax = plt.gca()
ax.hist(opening_gross, bins=10000, alpha=0.5, label="Opening gross")
ax.axvline(x=opening_gross.mean(), label="Mean", color="r")
ax.set_ylim(0, 50)
ax.set_title("Distribution of movie's opening gross (adjusted for inflation) 1990 - 2015")
ax.set_xlabel("Opening gross (adjusted for inflation)")
ax.set_ylabel("Frequency")
ax.legend(loc='upper right')
Out[40]:
In [66]:
# One release-year value per movie: the mean of `year` over that movie's reviews.
year_movies = df_byreviews.groupby("movie_id")["year"].mean()
In [67]:
# Histogram of the per-movie year values, split into 26 bins.
ax = plt.gca()
ax.hist(year_movies, bins=26, alpha=0.5, label="Movies")
ax.set_title("Number of movies released from 1990 - 2015")
ax.set_xlabel("Year")
ax.set_ylabel("Frequency")
ax.legend(loc='upper right')
Out[67]:
In [69]:
# Split the reviews at the year-2000 boundary, then average the adjusted
# opening gross per movie within each half.
reviews_2000s = df_byreviews.loc[df_byreviews["year"] >= 2000]
reviews_1990s = df_byreviews.loc[df_byreviews["year"] < 2000]
opening_gross_2000s = reviews_2000s.groupby("movie_id")["adj_opening_gross"].mean()
opening_gross_1990s = reviews_1990s.groupby("movie_id")["adj_opening_gross"].mean()
In [81]:
# Overlay the per-movie opening-gross distributions for movies from 2000
# onwards and movies before 2000, with a vertical reference line.
JAWS_OPENING_GROSS = 31217400.17  # the value labelled "Opening gross for Jaws (1975)" below

# Both histograms use the same 1000 bins spanning the combined range.
# Passing bins=1000 to each call separately gives each series its own bin
# width, so the bar heights of the two groups cannot be compared directly.
gross_lo = min(opening_gross_2000s.min(), opening_gross_1990s.min())
gross_hi = max(opening_gross_2000s.max(), opening_gross_1990s.max())
shared_bins = np.linspace(gross_lo, gross_hi, 1001)  # 1001 edges -> 1000 bins

plt.hist(opening_gross_2000s, bins=shared_bins, alpha=0.5, color="r", label="Opening gross for movies post 2000s")
plt.hist(opening_gross_1990s, bins=shared_bins, alpha=0.5, color="k", label="Opening gross for movies pre 2000s")
plt.axvline(x=JAWS_OPENING_GROSS, label="Opening gross for Jaws (1975)")
# NOTE(review): this y-limit was picked for the old per-series bins; re-check
# that it still frames the bars sensibly.
plt.ylim(0, 100)
plt.title("Distribution of movie's opening gross (adjusted for inflation) 1990 - 2015")
plt.xlabel("Opening gross (adjusted for inflation)")
plt.ylabel("Frequency")
plt.legend(loc='upper right')
Out[81]:
In [82]:
# NOTE(review): this cell plots the same two series, with the same title and
# labels, as the preceding cell; consider deleting one of the two.
# Overlay the per-movie opening-gross distributions for movies from 2000
# onwards and movies before 2000, with a vertical reference line.
JAWS_OPENING_GROSS = 31217400.17  # the value labelled "Opening gross for Jaws (1975)" below

# Both histograms use the same 1000 bins spanning the combined range.
# Passing bins=1000 to each call separately gives each series its own bin
# width, so the bar heights of the two groups cannot be compared directly.
gross_lo = min(opening_gross_2000s.min(), opening_gross_1990s.min())
gross_hi = max(opening_gross_2000s.max(), opening_gross_1990s.max())
shared_bins = np.linspace(gross_lo, gross_hi, 1001)  # 1001 edges -> 1000 bins

plt.hist(opening_gross_2000s, bins=shared_bins, alpha=0.5, color="r", label="Opening gross for movies post 2000s")
plt.hist(opening_gross_1990s, bins=shared_bins, alpha=0.5, color="k", label="Opening gross for movies pre 2000s")
plt.axvline(x=JAWS_OPENING_GROSS, label="Opening gross for Jaws (1975)")
# NOTE(review): this y-limit was picked for the old per-series bins; re-check
# that it still frames the bars sensibly.
plt.ylim(0, 100)
plt.title("Distribution of movie's opening gross (adjusted for inflation) 1990 - 2015")
plt.xlabel("Opening gross (adjusted for inflation)")
plt.ylabel("Frequency")
plt.legend(loc='upper right')
Out[82]:
In [84]:
# Per-movie mean adjusted opening gross for two groups of reviews:
# 2000 <= year < 2010, and year >= 2010.
# The two bounds for the first group are combined into one mask with `&`.
# Chaining df[mask_a][mask_b] applies a mask built on the full frame to an
# already filtered frame, which pandas has to reindex (and warns about).
in_00s = (df_byreviews.year >= 2000) & (df_byreviews.year < 2010)
in_10s = df_byreviews.year >= 2010
opening_gross_00s = df_byreviews[in_00s].groupby("movie_id").adj_opening_gross.mean()
opening_gross_10s = df_byreviews[in_10s].groupby("movie_id").adj_opening_gross.mean()
In [86]:
# Overlay the per-movie opening-gross distributions of the three groups,
# showing only the part of the x-axis at or above the reference value.
JAWS_OPENING_GROSS = 31217400.17  # the value labelled "Opening gross for Jaws (1975)" below

# All three histograms use the same 1000 bins spanning the combined range.
# Passing bins=1000 to each call separately gives each series its own bin
# width, so the bar heights of the groups cannot be compared directly.
decade_series = [opening_gross_10s, opening_gross_00s, opening_gross_1990s]
gross_lo = min(series.min() for series in decade_series)
gross_hi = max(series.max() for series in decade_series)
shared_bins = np.linspace(gross_lo, gross_hi, 1001)  # 1001 edges -> 1000 bins

plt.hist(opening_gross_10s, bins=shared_bins, alpha=0.5, color="g", label="Opening gross for movies post 2010s")
plt.hist(opening_gross_00s, bins=shared_bins, alpha=0.5, color="r", label="Opening gross for movies post 2000 - 2010")
plt.hist(opening_gross_1990s, bins=shared_bins, alpha=0.5, color="k", label="Opening gross for movies pre 2000s")
plt.axvline(x=JAWS_OPENING_GROSS, label="Opening gross for Jaws (1975)")
# NOTE(review): this y-limit was picked for the old per-series bins; re-check
# that it still frames the bars sensibly.
plt.ylim(0, 30)
# Only the left x-limit is set; the right limit stays automatic.
plt.xlim(JAWS_OPENING_GROSS,)
plt.title("Distribution of movie's opening gross (adjusted for inflation) 1990 - 2015")
plt.xlabel("Opening gross (adjusted for inflation)")
plt.ylabel("Frequency")
plt.legend(loc='upper right')
Out[86]:
In [88]:
# Number of non-null review_id values per movie.
review_count = df_byreviews.groupby("movie_id")["review_id"].count()
In [90]:
# Average number of reviews per movie.
review_count.mean()
Out[90]:
In [94]:
# Histogram of the number of reviews per movie, with the mean marked in red.
plt.hist(review_count, bins=50, alpha=0.5, color="b", label="Number of reviews")
# The mean is a number of reviews, i.e. a position on the x-axis, so it is
# drawn as a vertical line. A horizontal line would place it on the frequency
# axis, where it has no meaning.
plt.axvline(x=review_count.mean(), color="r", label="Average number of reviews")
plt.xlim(0, 100)
plt.title("Movie reviews")
plt.xlabel("Number of reviews")
plt.ylabel("Frequency")
plt.legend(loc='upper right')
Out[94]:
In [104]:
# Scatter of each review's valence average against its n_scorables value.
ax = plt.gca()
ax.scatter(df_byreviews["n_scorables"], df_byreviews["valence_avg"])
ax.set_title("Valence average vs. number of words within the review that were scored")
ax.set_xlabel("Number of words scorable")
ax.set_ylabel("Valence average for review")
Out[104]:
In [103]:
# Scatter of the "mean" column (x) against "valence_avg" (y), one red point
# per review.
ax = plt.gca()
ax.scatter(df_byreviews["mean"], df_byreviews["valence_avg"], color="r")
ax.set_title("Sentiment analysis gives more nuance than a valence average.")
ax.set_xlabel("Probability of positive from sentiment analysis")
ax.set_ylabel("Valence_Avg from simple sentiment dictionary")
Out[103]:
In [17]:
# Scatter of per-movie mean stars against per-movie mean adjusted opening
# gross, with the same reference line the histogram cells draw.
JAWS_OPENING_GROSS = 31217400.17  # labelled "Opening gross for Jaws (1975)" in the histogram cells

plt.scatter(opening_gross, stars)
plt.axvline(x=JAWS_OPENING_GROSS, label="Opening gross for Jaws (1975)")
# Title, axis labels and legend added so the figure can be read on its own,
# as the other figures in this notebook can.
plt.title("Average stars vs. opening gross (adjusted for inflation)")
plt.xlabel("Opening gross (adjusted for inflation)")
plt.ylabel("Average stars per movie")
plt.legend(loc='upper right')
Out[17]:
In [ ]: