In [8]:
import pandas as pd

# Load the superhero movie CSV straight from its raw GitHub URL into `sh`
# and preview the first rows.
# NOTE(review): read_csv treats the first line of the file as the header row
# by default. The next cell assigns column names by hand; if the file truly
# has no header line, the first record is being consumed as column labels --
# confirm against the raw file and pass header=None if so.
dataset_url = 'https://raw.githubusercontent.com/mafudge/datasets/master/superhero/superhero-movie-dataset-1978-2012.csv'
sh = pd.read_csv(dataset_url)
sh.head(5)
Out[8]:
In [9]:
# Replace the column labels of `sh` with short snake_case names. The ten
# labels are assigned positionally, left to right, to the existing columns.
# Note: 'opening_weeked_bo' is spelled exactly this way (sic). A later cell
# sorts on that label, so it must stay in sync if it is ever corrected.
short_names = [
    'year',
    'title',
    'comic',
    'imdb',
    'rt',
    'composite',
    'opening_weeked_bo',
    'avg_ticket_price',
    'opening_weekend_attend',
    'us_pop_that_year',
]
sh.columns = short_names
sh.head(5)
Out[9]:
In [10]:
# Who has more movies, DC or Marvel?
# Tally the number of rows for each distinct value in the 'comic' column;
# value_counts lists the most frequent value first.
movies_per_comic = sh['comic'].value_counts()
movies_per_comic
Out[10]:
In [11]:
# Same tally expressed as a share of all rows. normalize=True returns
# fractions that sum to 1 (not values multiplied by 100).
comic_share = sh['comic'].value_counts(normalize=True)
comic_share
Out[11]:
In [12]:
# Share per comic for the most recent ten years of data: rows whose year is
# strictly greater than 2002.
recent_comics = sh.loc[sh['year'] > 2002, 'comic']
recent_comics.value_counts(normalize=True)
Out[12]:
In [13]:
# Share per comic for the first ten years of data (1978 - 1987): rows whose
# year is strictly less than 1988.
in_first_decade = sh['year'] < 1988
sh.loc[in_first_decade, 'comic'].value_counts(normalize=True)
Out[13]:
In [14]:
# Preview the first five rows of the full frame again before cleaning.
sh.head(n=5)
Out[14]:
In [15]:
# Skip nulls in the analysis: keep only complete rows. Any row with a missing
# value in at least one column is dropped; `sh` itself is left unchanged and
# the result is bound to `sh2`.
sh2 = sh.dropna(axis=0, how='any')
sh2.head(5)
Out[15]:
In [16]:
# Feature engineering: share of the US population that attended on opening
# weekend, computed as opening_weekend_attend / us_pop_that_year.
# Despite the name, 'pct_of_pop' is a plain ratio (not multiplied by 100).
#
# `sh2` is the result of sh.dropna(); assigning a new column to it in place
# (sh2['pct_of_pop'] = ...) can raise SettingWithCopyWarning on pandas
# versions without copy-on-write. assign() returns a new frame with the
# extra column instead, and rebinding it to the same name keeps the later
# cells that read sh2['pct_of_pop'] working. Re-running this cell is safe.
sh2 = sh2.assign(
    pct_of_pop=sh2['opening_weekend_attend'] / sh2['us_pop_that_year']
)
sh2.head()
Out[16]:
In [17]:
# Marvel movies with the highest opening_weeked_bo values.
# Rows are sorted ascending and the last five are shown, so the largest
# value appears at the bottom of the output.
is_marvel = sh2['comic'] == 'Marvel'
marvel_by_opening = sh2.loc[is_marvel].sort_values(by='opening_weeked_bo', ascending=True)
marvel_by_opening.tail(5)
Out[17]:
In [19]:
# Which movies drew the greatest share of the population? Top 3.
# Sort descending on the 'pct_of_pop' column (created in an earlier cell)
# and keep the first three rows.
ranked_by_reach = sh2.sort_values(by='pct_of_pop', ascending=False)
ranked_by_reach.iloc[:3]
Out[19]:
In [ ]:
In [ ]: