notebook.community

Edit and run



In [30]:

    
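# One-time data download, left commented out so that re-running the notebook
# does not fetch the file again. Run the two commands below in a shell cell
# if star_wars.csv is not already present in the working directory.
# %sh

# wget https://raw.githubusercontent.com/fivethirtyeight/data/master/star-wars-survey/StarWars.csv
# mv StarWars.csv star_wars.csv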



In [31]:

    
import sys

# Python 2 only: switch the interpreter-wide default string encoding to UTF-8.
# Neither the `reload` builtin nor `sys.setdefaultencoding` exists on Python 3
# (where the default encoding is already UTF-8), so the hack is guarded to keep
# this cell runnable on both major versions.
if sys.version_info[0] < 3:
  reload(sys)
  sys.setdefaultencoding('utf8')
sys.getdefaultencoding()



In [32]:

    
import pandas as pd

# Load the raw survey export. The file does not appear to be valid UTF-8 (one
# column header renders with replacement characters), so a single-byte encoding
# is passed explicitly; with the UTF-8 default, pandas on Python 3 would fail
# to decode it. ISO-8859-1 can decode any byte sequence, so the read cannot
# raise a decoding error.
star_wars = pd.read_csv("star_wars.csv", encoding="ISO-8859-1")
print(star_wars.shape)



In [33]:

    
# Peek at the first three rows (plain-text rendering via print).
preview = star_wars.head(3)
print(preview)



In [34]:

    
# List every column label of the loaded frame.
column_labels = star_wars.columns
print(column_labels)



In [35]:

    
# Keep only the rows that have a RespondentID.
# `.copy()` turns the filtered result into an independent frame: later cells
# assign to its columns, and doing that on a filtered selection can trigger
# pandas' SettingWithCopyWarning.
star_wars = star_wars[pd.notnull(star_wars["RespondentID"])].copy()
print(star_wars.shape)



In [36]:

    
# Print the frequency table of every column, counting missing values too.
for label in star_wars.columns:
  frequencies = star_wars[label].value_counts(dropna=False)
  print(frequencies)



In [37]:

    
import numpy as np

# Survey answers -> booleans. True/False map to themselves, presumably so that
# values which were already converted survive a re-run of the cleaning cell.
yes_no = {"Yes": True, "No": False, True: True, False: False}

# Original column label -> short "seen_N" name for the six
# "which films have you seen" columns.
seen_columns = {
  "Which of the following Star Wars films have you seen? Please select all that apply.": "seen_1",
}
seen_columns.update(
  ("Unnamed: {0}".format(position), "seen_{0}".format(position - 2))
  for position in range(4, 9)
)

# A film title in a "seen" column means the respondent saw that film;
# a missing value (NaN) means they did not.
seen_movies = dict.fromkeys(
  [
    "Star Wars: Episode I  The Phantom Menace",
    "Star Wars: Episode II  Attack of the Clones",
    "Star Wars: Episode III  Revenge of the Sith",
    "Star Wars: Episode IV  A New Hope",
    "Star Wars: Episode V The Empire Strikes Back",
    "Star Wars: Episode VI Return of the Jedi",
  ],
  True,
)
seen_movies[np.nan] = False

# star_wars["Have you seen any of the 6 films in the Star Wars franchise?"] = star_wars["Have you seen any of the 6 films in the Star Wars franchise?"].map(yes_no, na_action="ignore")
# star_wars["Do you consider yourself to be a fan of the Star Wars film franchise?"] = star_wars["Do you consider yourself to be a fan of the Star Wars film franchise?"].map(yes_no, na_action="ignore")

def clean_columns(df, columns, mapping):
  """Recode the given columns of ``df`` in place and print a summary of each.

  Args:
    df: DataFrame to modify; every listed column is overwritten.
    columns: iterable of column labels to recode.
    mapping: dict handed to ``Series.map``. Values that are not keys of the
      mapping become NaN. A ``np.nan`` key may be included to recode missing
      answers explicitly (e.g. ``np.nan: False``).

  Returns:
    None. For each column, the label and its value counts (including missing
    values) are printed.
  """
  for column in columns:
    # na_action="ignore" is deliberately NOT passed: recent pandas versions
    # honour it for dict mappers and would then skip NaN inputs entirely, so
    # an explicit NaN entry in `mapping` would never be applied. Without the
    # argument, NaN stays NaN unless the mapping says otherwise.
    df[column] = df[column].map(mapping)
    print(column)
    print("{0}\n".format(df[column].value_counts(dropna=False)))

# Columns 1-2 are the two yes/no questions.
clean_columns(star_wars, star_wars.columns[1:3], yes_no)
star_wars.rename(columns=seen_columns, inplace=True)

# Columns 3-8 are the "seen" columns. Merge both mappings so that film titles,
# NaN and already-converted booleans are all handled. Copy-then-update is used
# instead of `a.items() + b.items()`, which only works on Python 2 (on
# Python 3 dict views cannot be concatenated); as before, yes_no entries win
# on duplicate keys.
seen_mapping = dict(seen_movies)
seen_mapping.update(yes_no)
clean_columns(star_wars, star_wars.columns[3:9], seen_mapping)

star_wars.head(3)









    Out[37]:






  
    
      
      RespondentID
      Have you seen any of the 6 films in the Star Wars franchise?
      Do you consider yourself to be a fan of the Star Wars film franchise?
      seen_1
      seen_2
      seen_3
      seen_4
      seen_5
      seen_6
      Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.
      ...
      Unnamed: 28
      Which character shot first?
      Are you familiar with the Expanded Universe?
      Do you consider yourself to be a fan of the Expanded Universe?��
      Do you consider yourself to be a fan of the Star Trek franchise?
      Gender
      Age
      Household Income
      Education
      Location (Census Region)
    
  
  
    
      1
      3.292880e+09
      True
      True
      True
      True
      True
      True
      True
      True
      3
      ...
      Very favorably
      I don't understand this question
      Yes
      No
      No
      Male
      18-29
      NaN
      High school degree
      South Atlantic
    
    
      2
      3.292880e+09
      False
      NaN
      False
      False
      False
      False
      False
      False
      NaN
      ...
      NaN
      NaN
      NaN
      NaN
      Yes
      Male
      18-29
      $0 - $24,999
      Bachelor degree
      West South Central
    
    
      3
      3.292765e+09
      True
      False
      True
      True
      True
      False
      False
      False
      1
      ...
      Unfamiliar (N/A)
      I don't understand this question
      No
      NaN
      No
      Male
      18-29
      $0 - $24,999
      High school degree
      West North Central
    
  

3 rows × 38 columns



In [38]:

    
ranking_columns = {
  "Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.": "ranking_1", 
  "Unnamed: 10": "ranking_2", 
  "Unnamed: 11": "ranking_3", 
  "Unnamed: 12": "ranking_4", 
  "Unnamed: 13": "ranking_5", 
  "Unnamed: 14": "ranking_6", 
}

# Convert the six ranking columns (positions 9-14) to float.
# The assignment goes through column labels rather than
# `star_wars.iloc[:, 9:15] = ...`: recent pandas versions write iloc
# assignments into the existing arrays, which would leave the columns with
# their original dtype instead of float. Assigning by label replaces the
# columns, so the new dtype sticks on old and new pandas alike.
ranking_labels = list(star_wars.columns[9:15])
star_wars[ranking_labels] = star_wars[ranking_labels].astype(float)

star_wars.rename(columns=ranking_columns, inplace=True)

star_wars.head(3)









    Out[38]:






  
    
      
      RespondentID
      Have you seen any of the 6 films in the Star Wars franchise?
      Do you consider yourself to be a fan of the Star Wars film franchise?
      seen_1
      seen_2
      seen_3
      seen_4
      seen_5
      seen_6
      ranking_1
      ...
      Unnamed: 28
      Which character shot first?
      Are you familiar with the Expanded Universe?
      Do you consider yourself to be a fan of the Expanded Universe?��
      Do you consider yourself to be a fan of the Star Trek franchise?
      Gender
      Age
      Household Income
      Education
      Location (Census Region)
    
  
  
    
      1
      3.292880e+09
      True
      True
      True
      True
      True
      True
      True
      True
      3.0
      ...
      Very favorably
      I don't understand this question
      Yes
      No
      No
      Male
      18-29
      NaN
      High school degree
      South Atlantic
    
    
      2
      3.292880e+09
      False
      NaN
      False
      False
      False
      False
      False
      False
      NaN
      ...
      NaN
      NaN
      NaN
      NaN
      Yes
      Male
      18-29
      $0 - $24,999
      Bachelor degree
      West South Central
    
    
      3
      3.292765e+09
      True
      False
      True
      True
      True
      False
      False
      False
      1.0
      ...
      Unfamiliar (N/A)
      I don't understand this question
      No
      NaN
      No
      Male
      18-29
      $0 - $24,999
      High school degree
      West North Central
    
  

3 rows × 38 columns



In [60]:

    
# Find the ranking of the movies
import matplotlib.pyplot as plt
%matplotlib inline

# Get movie names: the non-missing keys of `seen_movies`, sorted.
# `sorted(...)` replaces `filter(...)` followed by `.sort()`, which fails on
# Python 3 where filter() returns an iterator. Alphabetical order of these six
# titles coincides with episode order I..VI.
# NOTE(review): this assumes the ranking columns are also in episode order - confirm.
movie_names = sorted(name for name in seen_movies if pd.notnull(name))

# Mean rank per film (columns 9-14 hold the rankings; 1 = favourite).
ranking = star_wars.iloc[:,9:15].mean()
ranking.index = movie_names
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# Draw on the axes created above instead of relying on the implicit current axes.
ranking.plot(kind="bar", ax=ax)
ax.set_ylabel("mean rank (1 = favorite)")
# display(fig)
fig.show()



In [64]:

    
# Count, per film, how many respondents reported having seen it
# (sum over the six "seen" columns at positions 3-8).
seen = star_wars.iloc[:, 3:9].sum()
seen.index = movie_names

# One figure holding a single bar chart.
fig, ax = plt.subplots()
seen.plot.bar(ax=ax)
# display(fig)
fig.show()



In [71]:

    
# Split the respondents on the Gender column.
males = star_wars.loc[star_wars["Gender"] == "Male"]
females = star_wars.loc[star_wars["Gender"] == "Female"]

# Mean rank per film within each group (ranking columns sit at positions 9-14).
male_ranking, female_ranking = (
  group.iloc[:, 9:15].mean() for group in (males, females)
)

# Side-by-side table: one row per film, one column per gender.
total_ranking = pd.concat([male_ranking, female_ranking], axis=1)
total_ranking.index = movie_names

total_ranking.plot.bar()
plt.legend(["male", "female"])
# display(plt.show())
plt.show()



In [ ]:

	RespondentID	Have you seen any of the 6 films in the Star Wars franchise?	Do you consider yourself to be a fan of the Star Wars film franchise?	seen_1	seen_2	seen_3	seen_4	seen_5	seen_6	Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.	...	Unnamed: 28	Which character shot first?	Are you familiar with the Expanded Universe?	Do you consider yourself to be a fan of the Expanded Universe?��	Do you consider yourself to be a fan of the Star Trek franchise?	Gender	Age	Household Income	Education	Location (Census Region)
1	3.292880e+09	True	True	True	True	True	True	True	True	3	...	Very favorably	I don't understand this question	Yes	No	No	Male	18-29	NaN	High school degree	South Atlantic
2	3.292880e+09	False	NaN	False	False	False	False	False	False	NaN	...	NaN	NaN	NaN	NaN	Yes	Male	18-29	$0 - $24,999	Bachelor degree	West South Central
3	3.292765e+09	True	False	True	True	True	False	False	False	1	...	Unfamiliar (N/A)	I don't understand this question	No	NaN	No	Male	18-29	$0 - $24,999	High school degree	West North Central