In [ ]:
import os
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
import imdb_data_helper
In [ ]:
#JMF 24 Jul 2018: if you want interactive plots, open a terminal and `pip install mpld3 --user`,
# then uncomment the lines below.
#import mpld3
#mpld3.enable_notebook()
In [ ]:
# Data downloaded from https://www.imdb.com/interfaces/ and cleaned slightly.
# Each dataset is fetched only when the helper reports no local copy.
for already_present, fetch in (
    (imdb_data_helper.have_title_basics, imdb_data_helper.download_title_basics),
    (imdb_data_helper.have_title_ratings, imdb_data_helper.download_title_ratings),
):
    if not already_present():
        fetch()
In [ ]:
# Load the two IMDb tables through the helper module. Later cells read the
# 'primaryTitle' and 'genres' columns of title_basics and the 'averageRating'
# column of title_ratings, aligning the two tables by index.
# NOTE(review): presumably both are pandas DataFrames keyed by the IMDb title
# id -- confirm in imdb_data_helper.
title_basics = imdb_data_helper.get_title_basics()
title_ratings = imdb_data_helper.get_title_ratings()
In [ ]:
# Show every title whose primary title mentions "Jurassic World";
# rows with a missing title count as non-matches (na=False).
jurassic_world_mask = title_basics['primaryTitle'].str.contains('Jurassic World', na=False)
title_basics[jurassic_world_mask]
In [ ]:
def genres_list():
    """Yield one list of genre names per title that has a genre string.

    Each entry of ``title_basics['genres']`` is a comma-separated string;
    it is split on ',' and yielded as a list. Non-string entries are skipped.
    """
    for genres in title_basics['genres']:
        # Non-string entries (presumably NaN for titles without genres) are
        # ignored. isinstance() also accepts str subclasses such as
        # numpy.str_, which an exact ``type(...) is str`` test would drop.
        if isinstance(genres, str):
            yield genres.split(',')


# Flatten the per-title genre lists into the set of distinct genre names.
all_genres = set(itertools.chain.from_iterable(genres_list()))
all_genres
In [ ]:
def make_histogram_for_genre(target_genre):
    """Plot a normalized histogram of average ratings for one genre.

    Selects the rows of ``title_basics`` whose comma-separated ``genres``
    entry lists ``target_genre``, looks up their ``averageRating`` in
    ``title_ratings`` by index, drops titles without a rating, and draws
    the distribution in a new matplotlib figure.

    Parameters
    ----------
    target_genre : str
        Exact genre name to select, e.g. ``'Sci-Fi'``.

    Returns
    -------
    None
        The function only draws; it returns nothing.
    """
    import re  # local import: the notebook's import cell does not load ``re``

    # Match the genre as a whole comma-delimited item. A plain substring
    # search would also select any genre that merely contains the name
    # (e.g. 'Music' would pull in 'Musical'), and would interpret regex
    # metacharacters in the name. Non-capturing groups keep pandas from
    # warning about match groups.
    genre_pattern = r'(?:^|,)' + re.escape(target_genre) + r'(?:,|$)'
    title_basics_for_genre = title_basics[
        title_basics['genres'].str.contains(genre_pattern, na=False)
    ]

    # Align ratings to the selected titles; titles absent from
    # title_ratings become NaN and are dropped.
    genre_ratings = title_ratings['averageRating'].reindex(title_basics_for_genre.index)
    genre_ratings = genre_ratings.dropna()

    #TODO: compute the mean and median rating (np.mean; np.median)
    #TODO: plot vertical lines denoting the mean and median
    # using plt.axvline (use optional parameter color='r', color='g')

    plt.figure()
    # 102 edges -> 101 bins of width 0.1 centred on 0.0, 0.1, ..., 10.0
    # (assumes ratings lie in [0, 10] -- TODO confirm). linspace fixes the
    # number of edges and the end point exactly, whereas np.arange with a
    # fractional step has a rounding-dependent length.
    bins = np.linspace(-0.05, 10.05, 102)
    plt.hist(genre_ratings, bins=bins, density=True)
    plt.xlabel('Average Rating')
    plt.title("Average Ratings for genre='{target_genre}'".format(target_genre=target_genre))
# Draw one histogram figure for each of two example genres.
make_histogram_for_genre('Sci-Fi')
make_histogram_for_genre('Western')
Q: Are there any "outliers" that you can see in any of the histograms?
Q: Which genre has the highest mean rating? Which has the lowest mean rating?