In [1]:
import pandas
# Load the score-comparison CSV into a DataFrame. The path is relative, so it
# resolves against the directory the notebook kernel is running in.
csv_path = "../data/GP11/fandango_score_comparison.csv"
movies = pandas.read_csv(csv_path)
In [2]:
# Preview the first five rows as a quick sanity check of the load
# (DataFrame.head() returns five rows when called without an argument).
movies.head()
Out[2]:
In [3]:
import matplotlib.pyplot as plt
%matplotlib inline
# Histogram of the Fandango_Stars column, labelled so the figure can be read
# on its own when the notebook is skimmed.
plt.hist(movies["Fandango_Stars"])
plt.title("Distribution of Fandango_Stars")
plt.xlabel("Fandango_Stars")
plt.ylabel("Number of movies")
# plt.show() renders the figure and keeps the raw (counts, bin edges, patches)
# tuple returned by plt.hist out of the cell output.
plt.show()
Out[3]:
In [4]:
# Histogram of the Metacritic_norm_round column, labelled so the figure can be
# read on its own when the notebook is skimmed.
plt.hist(movies["Metacritic_norm_round"])
plt.title("Distribution of Metacritic_norm_round")
plt.xlabel("Metacritic_norm_round")
plt.ylabel("Number of movies")
# plt.show() renders the figure and keeps the raw (counts, bin edges, patches)
# tuple returned by plt.hist out of the cell output.
plt.show()
Out[4]:
In [5]:
import numpy
# Mean, standard deviation and median of the two rating columns being compared.
# The six scalars keep their original names so later cells can still use them.
fandango_stars = movies["Fandango_Stars"]
metacritic_rounded = movies["Metacritic_norm_round"]

f_mean, m_mean = fandango_stars.mean(), metacritic_rounded.mean()
f_std, m_std = fandango_stars.std(), metacritic_rounded.std()
f_median, m_median = fandango_stars.median(), metacritic_rounded.median()

# Show the statistics as a labelled table rather than six bare numbers, so each
# value can be attributed to its column and statistic.
rating_stats = pandas.DataFrame(
    {
        "Fandango_Stars": [f_mean, f_std, f_median],
        "Metacritic_norm_round": [m_mean, m_std, m_median],
    },
    index=["mean", "std", "median"],
)
rating_stats
In [6]:
# Scatter of Metacritic_norm_round (x) against Fandango_Stars (y), with axis
# labels so it is clear which rating is on which axis.
plt.scatter(movies["Metacritic_norm_round"], movies["Fandango_Stars"])
plt.title("Fandango_Stars vs. Metacritic_norm_round")
plt.xlabel("Metacritic_norm_round")
plt.ylabel("Fandango_Stars")
# plt.show() renders the figure instead of echoing the artist repr as output.
plt.show()
Out[6]:
In [7]:
# Absolute per-row gap between the two ratings, stored on the frame as a new
# "fm_diff" column (re-running this cell simply recomputes the same column).
movies["fm_diff"] = movies["Metacritic_norm_round"].sub(movies["Fandango_Stars"]).abs()
In [8]:
# Order rows from the largest to the smallest rating gap and show the top five.
by_gap_desc = movies.sort_values("fm_diff", ascending=False)
by_gap_desc.head()
Out[8]:
In [9]:
from scipy.stats import pearsonr
# Pearson correlation between the two rating columns. The result unpacks into
# the correlation coefficient and its p-value; only the coefficient is shown.
correlation = pearsonr(movies["Fandango_Stars"], movies["Metacritic_norm_round"])
r_value, p_value = correlation
r_value
Out[9]:
In [10]:
from scipy.stats import linregress
# Least-squares line with Metacritic_norm_round as x and Fandango_Stars as y.
# Note: r_value and p_value are rebound here, replacing the values assigned
# from pearsonr in the earlier cell.
fit = linregress(movies["Metacritic_norm_round"], movies["Fandango_Stars"])
slope, intercept, r_value, p_value, stderr_slope = fit
In [11]:
# Evaluate the fitted line at a Metacritic_norm_round of 3 to get the
# corresponding predicted Fandango_Stars value.
metacritic_score = 3
pred = intercept + slope * metacritic_score
pred
Out[11]:
In [12]:
# Overlay the fitted regression line on the scatter of the two ratings.
# The line is drawn between its predictions at x = 1 and x = 5.
pred_1 = 1 * slope + intercept
pred_5 = 5 * slope + intercept
plt.scatter(movies["Metacritic_norm_round"], movies["Fandango_Stars"], label="Movies")
plt.plot([1, 5], [pred_1, pred_5], label="Linear fit")
# NOTE(review): the x-axis is clipped to [1, 5]; any rows with
# Metacritic_norm_round outside that range would not be visible — confirm.
plt.xlim(1, 5)
plt.title("Fandango_Stars vs. Metacritic_norm_round with linear fit")
plt.xlabel("Metacritic_norm_round")
plt.ylabel("Fandango_Stars")
plt.legend()
plt.show()