In [1]:
import pandas as pd
import matplotlib.pyplot as plt
# Columns kept for the comparison: the film title plus the four rating
# columns plotted below.
# NOTE(review): 'Metacritic_user_nom' is spelled as this notebook uses it
# throughout; presumably it matches the CSV header -- confirm before renaming.
cols = [
    'FILM',
    'RT_user_norm',
    'Metacritic_user_nom',
    'IMDB_norm',
    'Fandango_Ratingvalue',
]

# Read the full score table, then narrow it to the columns of interest.
reviews = pd.read_csv('fandango_scores.csv')
norm_reviews = reviews[cols]

# Preview the first five rows as plain text.
print(norm_reviews.head(5))
In [2]:
# Count how many films received each distinct rating value on each site.
fandango_ratings_freq_cnts = norm_reviews['Fandango_Ratingvalue'].value_counts()
imdb_freq = norm_reviews['IMDB_norm'].value_counts()

# value_counts() orders by frequency; re-sort by the rating value itself so
# the output reads as a distribution from lowest to highest rating.
fandango_distribution = fandango_ratings_freq_cnts.sort_index()
imdb_distribution = imdb_freq.sort_index()

# Emit both distributions, Fandango first, one after the other.
print(fandango_distribution, imdb_distribution, sep='\n')
In [3]:
# Single histogram of the Fandango rating column. The bin range is pinned to
# (0, 5) instead of being derived from the column's own min and max.
fig, ax = plt.subplots()
ax.hist(norm_reviews['Fandango_Ratingvalue'], range=(0, 5))

# Label the figure so it can be read on its own, consistent with the titled
# and y-labelled histograms in the multi-panel cell of this notebook.
ax.set_title('Distribution of Fandango Ratings')
ax.set_xlabel('Fandango_Ratingvalue')
ax.set_ylabel('Frequency')

# Text companion to the plot: rating counts listed from the highest rating
# value down to the lowest.
print(norm_reviews['Fandango_Ratingvalue'].value_counts().sort_index(ascending=False))
plt.show()
In [7]:
# The histogram range is the same across all plots. This matters when
# comparing distributions across several columns: without an explicit range,
# hist() builds its bins between each column's own min and max, so the panels
# would not share an x-scale.
valuespan = (0, 5)

# One tall figure with four vertically stacked panels, one per rating source.
fig = plt.figure(figsize=(5, 20))
ax1, ax2, ax3, ax4 = (fig.add_subplot(4, 1, row) for row in range(1, 5))

# (axes, column to plot, panel title) for each panel, top to bottom.
panels = [
    (ax1, 'Fandango_Ratingvalue', 'Distribution of Fandango Ratings'),
    (ax2, 'RT_user_norm', 'Distribution of Rotten Tomatoes Ratings'),
    (ax3, 'Metacritic_user_nom', 'Distribution of Metacritic Ratings'),
    (ax4, 'IMDB_norm', 'Distribution of IMDB Ratings'),
]

for axis, column, title in panels:
    axis.hist(norm_reviews[column], bins=20, range=valuespan)
    axis.set_title(title)
    # Same y-limits on every panel so bar heights are directly comparable;
    # any bar taller than 50 would be cut off at the top of the panel.
    axis.set_ylim(0, 50)
    axis.set_ylabel('Frequency')

plt.show()
In [20]:
# Two stacked box plots: Rotten Tomatoes user ratings on top, IMDB below.
fig = plt.figure(figsize=(12, 10))
ax = fig.add_subplot(2, 1, 1)
ax1 = fig.add_subplot(2, 1, 2)

# (axes, column to plot, x-axis label) for each panel, top to bottom.
for panel, column, label in (
    (ax, 'RT_user_norm', 'Rotten Tomatoes'),
    (ax1, 'IMDB_norm', 'IMDB ratings'),
):
    panel.boxplot(norm_reviews[column])
    # Fix the y-axis to (0, 5) on both panels so the boxes share one scale.
    panel.set_ylim(0, 5)
    panel.set_xlabel(label)

plt.show()
In [24]:
# Draw the box plots for every rating column side by side in a single axes.
# cols[0] is 'FILM', which is not a rating, so it is skipped.
rating_cols = cols[1:]

fig, ax = plt.subplots()
ax.boxplot(norm_reviews[rating_cols].values)

# Name each box after its column; rotate the labels so they do not overlap.
ax.set_xticklabels(rating_cols, rotation=90)

# Set explicit limits when plotting several columns together, because their
# individual min and max values can differ.
ax.set_ylim(0, 5)
plt.show()
In [ ]: