Watch Me Code 1: Superhero Movies

https://raw.githubusercontent.com/mafudge/datasets/master/superhero/superhero-movie-dataset-1978-2012.csv

COLUMNS year, title, comic, imdb, rt, composite, opening_weeked_bo, avg_ticket_price, opening_weekend_attend, us_pop_that_year

  • read_csv file from web
  • no column names
  • head(), sample()
  • value_counts
  • dealing with nulls
  • Feature engineering

In [8]:
import pandas as pd

sh = pd.read_csv('https://raw.githubusercontent.com/mafudge/datasets/master/superhero/superhero-movie-dataset-1978-2012.csv')
sh.head()


Out[8]:
1978 Superman DC 7.3 95 84 7465343 2.34 3190317.521 222584545
0 1980 Superman II DC 6.7 88 77.5 14100523.0 2.69 5241830.112 227224681
1 1982 Swamp Thing DC 5.3 60 56.5 NaN 2.94 NaN 231664458
2 1983 Superman III DC 4.9 24 36.5 13352357.0 3.15 4238843.492 233791994
3 1984 Supergirl DC 4.2 8 25.0 5738249.0 3.36 1707812.202 235824902
4 1986 Howard the Duck Marvel 4.3 16 29.5 5070136.0 3.71 1366613.477 240132887

In [9]:
# no columns? no sweat!
sh.columns = [ 'year', 'title', 'comic', 'imdb', 'rt', 'composite', 'opening_weeked_bo', 'avg_ticket_price', 'opening_weekend_attend', 'us_pop_that_year']
sh.head()


Out[9]:
year title comic imdb rt composite opening_weeked_bo avg_ticket_price opening_weekend_attend us_pop_that_year
0 1980 Superman II DC 6.7 88 77.5 14100523.0 2.69 5241830.112 227224681
1 1982 Swamp Thing DC 5.3 60 56.5 NaN 2.94 NaN 231664458
2 1983 Superman III DC 4.9 24 36.5 13352357.0 3.15 4238843.492 233791994
3 1984 Supergirl DC 4.2 8 25.0 5738249.0 3.36 1707812.202 235824902
4 1986 Howard the Duck Marvel 4.3 16 29.5 5070136.0 3.71 1366613.477 240132887

In [10]:
## Who has more movies? DC or Marvel?
sh['comic'].value_counts()


Out[10]:
Marvel    29
DC        19
Name: comic, dtype: int64

In [11]:
## let's see that as a percentage of the total
sh['comic'].value_counts(normalize=True)


Out[11]:
Marvel    0.604167
DC        0.395833
Name: comic, dtype: float64

In [12]:
## what are the ratios in the last 10 years of data ?
sh[ sh['year'] >2002]['comic'].value_counts(normalize=True)


Out[12]:
Marvel    0.741935
DC        0.258065
Name: comic, dtype: float64

In [13]:
# what about the first 10 years of data? 1978 - 1987?
sh[ sh['year'] < 1988]['comic'].value_counts(normalize=True)


Out[13]:
DC        0.833333
Marvel    0.166667
Name: comic, dtype: float64

In [14]:
sh.head()


Out[14]:
year title comic imdb rt composite opening_weeked_bo avg_ticket_price opening_weekend_attend us_pop_that_year
0 1980 Superman II DC 6.7 88 77.5 14100523.0 2.69 5241830.112 227224681
1 1982 Swamp Thing DC 5.3 60 56.5 NaN 2.94 NaN 231664458
2 1983 Superman III DC 4.9 24 36.5 13352357.0 3.15 4238843.492 233791994
3 1984 Supergirl DC 4.2 8 25.0 5738249.0 3.36 1707812.202 235824902
4 1986 Howard the Duck Marvel 4.3 16 29.5 5070136.0 3.71 1366613.477 240132887

In [15]:
## skip nulls in analysis
sh2 = sh.dropna()
sh2.head()


Out[15]:
year title comic imdb rt composite opening_weeked_bo avg_ticket_price opening_weekend_attend us_pop_that_year
0 1980 Superman II DC 6.7 88 77.5 14100523.0 2.69 5241830.112 227224681
2 1983 Superman III DC 4.9 24 36.5 13352357.0 3.15 4238843.492 233791994
3 1984 Supergirl DC 4.2 8 25.0 5738249.0 3.36 1707812.202 235824902
4 1986 Howard the Duck Marvel 4.3 16 29.5 5070136.0 3.71 1366613.477 240132887
5 1987 Superman IV: The Quest for Peace DC 3.6 10 23.0 5683122.0 3.91 1453483.887 242288918

In [16]:
# feature engineering: percentage of population seeing the movie.
sh2['pct_of_pop'] = sh2['opening_weekend_attend'] /sh2['us_pop_that_year']
sh2.head()


c:\users\mafud\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
Out[16]:
year title comic imdb rt composite opening_weeked_bo avg_ticket_price opening_weekend_attend us_pop_that_year pct_of_pop
0 1980 Superman II DC 6.7 88 77.5 14100523.0 2.69 5241830.112 227224681 0.023069
2 1983 Superman III DC 4.9 24 36.5 13352357.0 3.15 4238843.492 233791994 0.018131
3 1984 Supergirl DC 4.2 8 25.0 5738249.0 3.36 1707812.202 235824902 0.007242
4 1986 Howard the Duck Marvel 4.3 16 29.5 5070136.0 3.71 1366613.477 240132887 0.005691
5 1987 Superman IV: The Quest for Peace DC 3.6 10 23.0 5683122.0 3.91 1453483.887 242288918 0.005999

In [17]:
# Marvel comics with highest opening_weeked_bo
sh2[ sh2['comic'] == 'Marvel' ].sort_values('opening_weeked_bo').tail()


Out[17]:
year title comic imdb rt composite opening_weeked_bo avg_ticket_price opening_weekend_attend us_pop_that_year pct_of_pop
28 2006 X-Men: The Last Stand Marvel 6.8 57 62.5 102750665.0 6.55 15687124.43 298593212 0.052537
16 2002 Spider-Man Marvel 7.4 89 81.5 114844116.0 5.81 19766629.26 287803914 0.068681
38 2010 Iron Man 2 Marvel 7.1 74 72.5 128122480.0 7.89 16238590.62 308745538 0.052595
31 2007 Spider-Man 3 Marvel 6.3 63 63.0 151116516.0 6.88 21964609.88 301579895 0.072832
44 2012 Marvel's The Avengers Marvel 8.7 92 89.5 207438708.0 7.92 26191756.06 314055984 0.083398

In [19]:
# Greatest Percentage of the population seeing the movie? Top 3
sh2.sort_values('pct_of_pop', ascending = False).head(3)


Out[19]:
year title comic imdb rt composite opening_weeked_bo avg_ticket_price opening_weekend_attend us_pop_that_year pct_of_pop
44 2012 Marvel's The Avengers Marvel 8.7 92 89.5 207438708.0 7.92 26191756.06 314055984 0.083398
31 2007 Spider-Man 3 Marvel 6.3 63 63.0 151116516.0 6.88 21964609.88 301579895 0.072832
32 2008 The Dark Knight DC 8.9 94 91.5 158411483.0 7.18 22062880.64 304374846 0.072486

In [ ]:


In [ ]: