In [ ]:
import os
import numpy as np
import pandas as pd
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
import imdb_data_helper

In [ ]:
#JMF 24 Jul 2018: if you want interactive plots, open a terminal and `pip install mpld3 --user`,
#    then uncomment the lines below.
#import mpld3
#mpld3.enable_notebook()

In [ ]:
# Data comes from https://www.imdb.com/interfaces/ and has been cleaned slightly.
# Each dataset is downloaded only when the helper reports it is not already present.
for dataset_present, fetch_dataset in (
    (imdb_data_helper.have_title_basics, imdb_data_helper.download_title_basics),
    (imdb_data_helper.have_title_ratings, imdb_data_helper.download_title_ratings),
):
    if not dataset_present():
        fetch_dataset()

In [ ]:
# Load the two IMDb tables through the local helper module. Later cells use
# them as pandas objects (column selection, `.str` accessors, `.reindex`).
# NOTE(review): presumably both tables share the same title-identifier index,
# since ratings are later reindexed by the basics index -- confirm in
# imdb_data_helper.
title_basics = imdb_data_helper.get_title_basics()
title_ratings = imdb_data_helper.get_title_ratings()

In [ ]:
# Spot-check the loaded data: display every row whose primaryTitle contains
# 'Jurassic World'. na=False treats rows with a missing title as non-matches.
title_basics[title_basics['primaryTitle'].str.contains('Jurassic World', na=False)]

In [ ]:
def genres_list():
    """Yield the genres of each title as a list of genre-name strings.

    Iterates over the module-level ``title_basics['genres']`` column. String
    entries are split on ',' into their individual genre names; entries that
    are not strings (presumably NaN for titles with no genre information)
    are skipped.
    """
    for genres in title_basics['genres']:
        # isinstance() is the idiomatic type check and also accepts str subclasses.
        if isinstance(genres, str):
            yield genres.split(',')

# Flatten the per-title genre lists and de-duplicate them into one set.
all_genres = set(itertools.chain.from_iterable(genres_list()))

all_genres

In [ ]:
def make_histogram_for_genre(target_genre):
    """Plot a normalized histogram of average ratings for one genre.

    Selects the rows of the module-level ``title_basics`` whose comma-separated
    'genres' entry lists ``target_genre``, looks up their 'averageRating' in
    the module-level ``title_ratings`` (titles without a rating are dropped),
    and draws the distribution on a new figure. The mean and median rating are
    marked with red and green vertical lines respectively.

    Parameters
    ----------
    target_genre : str
        Genre name to select, e.g. 'Sci-Fi'. It is matched literally and as a
        whole entry of the comma-separated list, so 'Music' does not also
        select titles tagged only 'Musical'.

    Returns
    -------
    None
        The function is called for its side effect of creating a figure.
    """
    import re  # local import keeps this cell independent of the notebook's import cell

    # Anchor the genre between list separators (or string ends) and escape it,
    # so the match is an exact list entry rather than a regex substring search.
    # Non-capturing groups avoid pandas' "pattern has match groups" warning.
    genre_pattern = r'(?:^|,)\s*' + re.escape(target_genre) + r'\s*(?:,|$)'
    title_basics_for_genre = title_basics[title_basics['genres'].str.contains(genre_pattern, na=False)]

    # Align ratings to the selected titles; titles without a rating become NaN
    # after the reindex and are dropped.
    genre_ratings = title_ratings['averageRating'].reindex(title_basics_for_genre.index)
    genre_ratings = genre_ratings.dropna()

    fig, ax = plt.subplots()

    # 102 edges -> 101 bins of width 0.1 centred on 0.0, 0.1, ..., 10.0.
    # linspace fixes the number of edges explicitly; arange with a float step
    # has a rounding-dependent length and could omit the bin containing 10.0.
    bins = np.linspace(-0.05, 10.05, 102)
    ax.hist(genre_ratings, bins=bins, density=True)

    # Mean/median are undefined for an empty selection, so only draw the
    # marker lines when there is at least one rating.
    if not genre_ratings.empty:
        mean_rating = np.mean(genre_ratings)
        median_rating = np.median(genre_ratings)
        ax.axvline(mean_rating, color='r', label='mean = {:.2f}'.format(mean_rating))
        ax.axvline(median_rating, color='g', label='median = {:.2f}'.format(median_rating))
        ax.legend()

    ax.set_xlabel('Average Rating')
    ax.set_title("Average Ratings for genre='{target_genre}'".format(target_genre=target_genre))

# Draw one histogram per genre of interest, in the order listed.
for genre_name in ('Sci-Fi', 'Western'):
    make_histogram_for_genre(genre_name)

Q: Are there any "outliers" that you can see in any of the histograms?

Q: Which genre has the highest mean rating? Which has the lowest mean rating?