Watch Me Code 1: Superhero Movies

https://raw.githubusercontent.com/mafudge/datasets/master/superhero/superhero-movie-dataset-1978-2012.csv

COLUMNS year, title, comic, imdb, rt, composite, opening_weeked_bo, avg_ticket_price, opening_weekend_attend, us_pop_that_year

read_csv file from web
no column names
head(), sample()
value_counts
dealing with nulls
Feature engineering



In [8]:

    
import pandas as pd

sh = pd.read_csv('https://raw.githubusercontent.com/mafudge/datasets/master/superhero/superhero-movie-dataset-1978-2012.csv')
sh.head()









    Out[8]:







  
    
      
      1978
      Superman
      DC
      7.3
      95
      84
      7465343
      2.34
      3190317.521
      222584545
    
  
  
    
      0
      1980
      Superman II
      DC
      6.7
      88
      77.5
      14100523.0
      2.69
      5241830.112
      227224681
    
    
      1
      1982
      Swamp Thing
      DC
      5.3
      60
      56.5
      NaN
      2.94
      NaN
      231664458
    
    
      2
      1983
      Superman III
      DC
      4.9
      24
      36.5
      13352357.0
      3.15
      4238843.492
      233791994
    
    
      3
      1984
      Supergirl
      DC
      4.2
      8
      25.0
      5738249.0
      3.36
      1707812.202
      235824902
    
    
      4
      1986
      Howard the Duck
      Marvel
      4.3
      16
      29.5
      5070136.0
      3.71
      1366613.477
      240132887



In [9]:

    
# no columns? no sweat!
sh.columns = [ 'year', 'title', 'comic', 'imdb', 'rt', 'composite', 'opening_weeked_bo', 'avg_ticket_price', 'opening_weekend_attend', 'us_pop_that_year']
sh.head()









    Out[9]:







  
    
      
      year
      title
      comic
      imdb
      rt
      composite
      opening_weeked_bo
      avg_ticket_price
      opening_weekend_attend
      us_pop_that_year
    
  
  
    
      0
      1980
      Superman II
      DC
      6.7
      88
      77.5
      14100523.0
      2.69
      5241830.112
      227224681
    
    
      1
      1982
      Swamp Thing
      DC
      5.3
      60
      56.5
      NaN
      2.94
      NaN
      231664458
    
    
      2
      1983
      Superman III
      DC
      4.9
      24
      36.5
      13352357.0
      3.15
      4238843.492
      233791994
    
    
      3
      1984
      Supergirl
      DC
      4.2
      8
      25.0
      5738249.0
      3.36
      1707812.202
      235824902
    
    
      4
      1986
      Howard the Duck
      Marvel
      4.3
      16
      29.5
      5070136.0
      3.71
      1366613.477
      240132887



In [10]:

    
## Who has more movies? DC or Marvel?
sh['comic'].value_counts()









    Out[10]:





Marvel    29
DC        19
Name: comic, dtype: int64



In [11]:

    
## let's see that as a percentage of the total
sh['comic'].value_counts(normalize=True)









    Out[11]:





Marvel    0.604167
DC        0.395833
Name: comic, dtype: float64



In [12]:

    
## what are the ratios in the last 10 years of data ?
sh[ sh['year'] >2002]['comic'].value_counts(normalize=True)









    Out[12]:





Marvel    0.741935
DC        0.258065
Name: comic, dtype: float64



In [13]:

    
# what about the first 10 years of data? 1978 - 1987?
sh[ sh['year'] < 1988]['comic'].value_counts(normalize=True)









    Out[13]:





DC        0.833333
Marvel    0.166667
Name: comic, dtype: float64



In [14]:

    
sh.head()









    Out[14]:







  
    
      
      year
      title
      comic
      imdb
      rt
      composite
      opening_weeked_bo
      avg_ticket_price
      opening_weekend_attend
      us_pop_that_year
    
  
  
    
      0
      1980
      Superman II
      DC
      6.7
      88
      77.5
      14100523.0
      2.69
      5241830.112
      227224681
    
    
      1
      1982
      Swamp Thing
      DC
      5.3
      60
      56.5
      NaN
      2.94
      NaN
      231664458
    
    
      2
      1983
      Superman III
      DC
      4.9
      24
      36.5
      13352357.0
      3.15
      4238843.492
      233791994
    
    
      3
      1984
      Supergirl
      DC
      4.2
      8
      25.0
      5738249.0
      3.36
      1707812.202
      235824902
    
    
      4
      1986
      Howard the Duck
      Marvel
      4.3
      16
      29.5
      5070136.0
      3.71
      1366613.477
      240132887



In [15]:

    
## skip nulls in analysis
sh2 = sh.dropna()
sh2.head()









    Out[15]:







  
    
      
      year
      title
      comic
      imdb
      rt
      composite
      opening_weeked_bo
      avg_ticket_price
      opening_weekend_attend
      us_pop_that_year
    
  
  
    
      0
      1980
      Superman II
      DC
      6.7
      88
      77.5
      14100523.0
      2.69
      5241830.112
      227224681
    
    
      2
      1983
      Superman III
      DC
      4.9
      24
      36.5
      13352357.0
      3.15
      4238843.492
      233791994
    
    
      3
      1984
      Supergirl
      DC
      4.2
      8
      25.0
      5738249.0
      3.36
      1707812.202
      235824902
    
    
      4
      1986
      Howard the Duck
      Marvel
      4.3
      16
      29.5
      5070136.0
      3.71
      1366613.477
      240132887
    
    
      5
      1987
      Superman IV: The Quest for Peace
      DC
      3.6
      10
      23.0
      5683122.0
      3.91
      1453483.887
      242288918



In [16]:

    
# feature engineering: percentage of population seeing the movie.
sh2['pct_of_pop'] = sh2['opening_weekend_attend'] /sh2['us_pop_that_year']
sh2.head()









    



c:\users\mafud\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  






    Out[16]:







  
    
      
      year
      title
      comic
      imdb
      rt
      composite
      opening_weeked_bo
      avg_ticket_price
      opening_weekend_attend
      us_pop_that_year
      pct_of_pop
    
  
  
    
      0
      1980
      Superman II
      DC
      6.7
      88
      77.5
      14100523.0
      2.69
      5241830.112
      227224681
      0.023069
    
    
      2
      1983
      Superman III
      DC
      4.9
      24
      36.5
      13352357.0
      3.15
      4238843.492
      233791994
      0.018131
    
    
      3
      1984
      Supergirl
      DC
      4.2
      8
      25.0
      5738249.0
      3.36
      1707812.202
      235824902
      0.007242
    
    
      4
      1986
      Howard the Duck
      Marvel
      4.3
      16
      29.5
      5070136.0
      3.71
      1366613.477
      240132887
      0.005691
    
    
      5
      1987
      Superman IV: The Quest for Peace
      DC
      3.6
      10
      23.0
      5683122.0
      3.91
      1453483.887
      242288918
      0.005999



In [17]:

    
# Marvel comics with highest opening_weeked_bo
sh2[ sh2['comic'] == 'Marvel' ].sort_values('opening_weeked_bo').tail()









    Out[17]:







  
    
      
      year
      title
      comic
      imdb
      rt
      composite
      opening_weeked_bo
      avg_ticket_price
      opening_weekend_attend
      us_pop_that_year
      pct_of_pop
    
  
  
    
      28
      2006
      X-Men: The Last Stand
      Marvel
      6.8
      57
      62.5
      102750665.0
      6.55
      15687124.43
      298593212
      0.052537
    
    
      16
      2002
      Spider-Man
      Marvel
      7.4
      89
      81.5
      114844116.0
      5.81
      19766629.26
      287803914
      0.068681
    
    
      38
      2010
      Iron Man 2
      Marvel
      7.1
      74
      72.5
      128122480.0
      7.89
      16238590.62
      308745538
      0.052595
    
    
      31
      2007
      Spider-Man 3
      Marvel
      6.3
      63
      63.0
      151116516.0
      6.88
      21964609.88
      301579895
      0.072832
    
    
      44
      2012
      Marvel's The Avengers
      Marvel
      8.7
      92
      89.5
      207438708.0
      7.92
      26191756.06
      314055984
      0.083398



In [19]:

    
# Greatest Percentage of the population seeing the movie? Top 3
sh2.sort_values('pct_of_pop', ascending = False).head(3)









    Out[19]:







  
    
      
      year
      title
      comic
      imdb
      rt
      composite
      opening_weeked_bo
      avg_ticket_price
      opening_weekend_attend
      us_pop_that_year
      pct_of_pop
    
  
  
    
      44
      2012
      Marvel's The Avengers
      Marvel
      8.7
      92
      89.5
      207438708.0
      7.92
      26191756.06
      314055984
      0.083398
    
    
      31
      2007
      Spider-Man 3
      Marvel
      6.3
      63
      63.0
      151116516.0
      6.88
      21964609.88
      301579895
      0.072832
    
    
      32
      2008
      The Dark Knight
      DC
      8.9
      94
      91.5
      158411483.0
      7.18
      22062880.64
      304374846
      0.072486



In [ ]:



In [ ]:

	1978	Superman	DC	7.3	95	84	7465343	2.34	3190317.521	222584545
0	1980	Superman II	DC	6.7	88	77.5	14100523.0	2.69	5241830.112	227224681
1	1982	Swamp Thing	DC	5.3	60	56.5	NaN	2.94	NaN	231664458
2	1983	Superman III	DC	4.9	24	36.5	13352357.0	3.15	4238843.492	233791994
3	1984	Supergirl	DC	4.2	8	25.0	5738249.0	3.36	1707812.202	235824902
4	1986	Howard the Duck	Marvel	4.3	16	29.5	5070136.0	3.71	1366613.477	240132887

	year	title	comic	imdb	rt	composite	opening_weeked_bo	avg_ticket_price	opening_weekend_attend	us_pop_that_year	pct_of_pop
28	2006	X-Men: The Last Stand	Marvel	6.8	57	62.5	102750665.0	6.55	15687124.43	298593212	0.052537
16	2002	Spider-Man	Marvel	7.4	89	81.5	114844116.0	5.81	19766629.26	287803914	0.068681
38	2010	Iron Man 2	Marvel	7.1	74	72.5	128122480.0	7.89	16238590.62	308745538	0.052595
31	2007	Spider-Man 3	Marvel	6.3	63	63.0	151116516.0	6.88	21964609.88	301579895	0.072832
44	2012	Marvel's The Avengers	Marvel	8.7	92	89.5	207438708.0	7.92	26191756.06	314055984	0.083398