In [30]:
# One-time data download, kept commented out. The shell commands fetch the
# FiveThirtyEight survey export and rename it to star_wars.csv, which is the
# file name the loading cell below reads.
# %sh
# wget https://raw.githubusercontent.com/fivethirtyeight/data/master/star-wars-survey/StarWars.csv
# mv StarWars.csv star_wars.csv
In [31]:
import sys

# Python 2 only: force the interpreter-wide default string encoding to UTF-8.
# The module has to be reloaded first because `sys.setdefaultencoding` is not
# available on the already-imported module. Python 3 has neither the builtin
# `reload` nor `sys.setdefaultencoding`, so the hack is skipped there instead
# of failing with a NameError.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')
# Last expression of the cell: shows the encoding that is now in effect.
sys.getdefaultencoding()
In [32]:
import pandas as pd
# Load the survey responses; the path is relative, so the CSV must be in the
# notebook's working directory.
star_wars = pd.read_csv("star_wars.csv")
# (rows, columns) of the raw frame, before any cleaning.
print(star_wars.shape)
In [33]:
# Text preview of the first three rows of the raw data.
print(star_wars.head(3))
In [34]:
# List the raw column labels; later cells rename several of them.
print(star_wars.columns)
In [35]:
# Keep only the rows that actually carry a respondent id, then report the
# size of the remaining frame.
respondent_present = pd.notnull(star_wars["RespondentID"])
star_wars = star_wars.loc[respondent_present]
print(star_wars.shape)
In [36]:
# Print the frequency table of every column, counting missing values as
# their own category.
for column in star_wars.columns:
    column_counts = star_wars[column].value_counts(dropna=False)
    print(column_counts)
In [37]:
import numpy as np
# Survey answers -> booleans. The boolean keys map to themselves, so values
# that are already booleans pass through the mapping unchanged.
yes_no = dict([
    ("Yes", True),
    ("No", False),
    (True, True),
    (False, False),
])

# Raw headers of the six "have you seen ...?" columns, in file order; they
# are renamed to seen_1 ... seen_6.
_seen_headers = (
    "Which of the following Star Wars films have you seen? Please select all that apply.",
    "Unnamed: 4",
    "Unnamed: 5",
    "Unnamed: 6",
    "Unnamed: 7",
    "Unnamed: 8",
)
seen_columns = dict(
    (header, "seen_{0}".format(position))
    for position, header in enumerate(_seen_headers, 1)
)

# A film title in a "seen" column means the respondent has seen that film;
# a missing value means they have not.
_episode_titles = (
    "Star Wars: Episode I The Phantom Menace",
    "Star Wars: Episode II Attack of the Clones",
    "Star Wars: Episode III Revenge of the Sith",
    "Star Wars: Episode IV A New Hope",
    "Star Wars: Episode V The Empire Strikes Back",
    "Star Wars: Episode VI Return of the Jedi",
)
seen_movies = dict.fromkeys(_episode_titles, True)
seen_movies[np.nan] = False
# star_wars["Have you seen any of the 6 films in the Star Wars franchise?"] = star_wars["Have you seen any of the 6 films in the Star Wars franchise?"].map(yes_no, na_action="ignore")
# star_wars["Do you consider yourself to be a fan of the Star Wars film franchise?"] = star_wars["Do you consider yourself to be a fan of the Star Wars film franchise?"].map(yes_no, na_action="ignore")
def clean_columns(df, columns, mapping):
    """Recode the given columns of ``df`` in place and print a summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with the mapped values.
    columns : iterable
        Labels of the columns to recode.
    mapping : dict or other mapper accepted by ``Series.map``
        Old value -> new value. Values without an entry become NaN.

    Missing values are normally left untouched (``na_action="ignore"``).
    When ``mapping`` is a dict with an explicit NaN key (e.g.
    ``{np.nan: False}``), that entry is honoured instead. Combining such a
    key with ``na_action="ignore"`` gives pandas-version-dependent results,
    because the NaN entry may be discarded before the lookup.

    After each column is recoded, its name and value counts (including
    missing values) are printed.
    """
    # NaN is the only float that is not equal to itself.
    maps_missing = isinstance(mapping, dict) and any(
        isinstance(key, float) and key != key for key in mapping
    )
    na_action = None if maps_missing else "ignore"
    for column in columns:
        df[column] = df[column].map(mapping, na_action=na_action)
        print(column)
        print("{0}\n".format(df[column].value_counts(dropna=False)))
# Columns 1-2 hold the two Yes/No questions.
clean_columns(star_wars, star_wars.columns[1:3], yes_no)
star_wars = star_wars.rename(columns=seen_columns)
# Columns 3-8 are the six "seen" columns. The mapping is the film-title
# mapping extended with the boolean pass-through entries of `yes_no`.
# It is built with copy + update because `items() + items()` only works on
# Python 2, where `items()` returns lists; Python 3 dict views cannot be
# concatenated with `+`.
seen_mapping = dict(seen_movies)
seen_mapping.update(yes_no)
clean_columns(star_wars, star_wars.columns[3:9], seen_mapping)
star_wars.head(3)
Out[37]:
In [38]:
# Raw header -> short name for the six ranking columns (positions 9-14).
ranking_columns = {
    "Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.": "ranking_1",
    "Unnamed: 10": "ranking_2",
    "Unnamed: 11": "ranking_3",
    "Unnamed: 12": "ranking_4",
    "Unnamed: 13": "ranking_5",
    "Unnamed: 14": "ranking_6",
}
# Convert the rankings to float. The columns are replaced by label rather
# than written through `.iloc[:, 9:15] = ...`: recent pandas versions perform
# that kind of positional assignment in place and may keep the original
# column dtype, whereas assigning whole columns always adopts the new dtype.
rank_labels = star_wars.columns[9:15]
star_wars[rank_labels] = star_wars[rank_labels].astype(float)
star_wars = star_wars.rename(columns=ranking_columns)
star_wars.head(3)
Out[38]:
In [60]:
# Find the ranking of the movies
import matplotlib.pyplot as plt
%matplotlib inline
# Get movie names
movie_names = filter(lambda x: x is not np.nan, list(seen_movies.keys()))
movie_names.sort()
ranking = star_wars.iloc[:,9:15].mean()
ranking.index = movie_names
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ranking.plot(kind="bar")
# display(fig)
fig.show()
In [64]:
# Find the most seen movies
# Column-wise sum over the six "seen" columns (positions 3-8), i.e. how many
# respondents are flagged as having seen each film.
seen = star_wars.iloc[:, 3:9].sum()
# NOTE(review): assumes the seen columns are in the same order as
# `movie_names` - confirm against the survey layout.
seen.index = movie_names
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# Draw on the axes created above instead of relying on the implicit
# "current axes".
seen.plot(kind="bar", ax=ax)
ax.set_ylabel("Respondents who have seen the film")
# plt.show() rather than fig.show(): the latter warns on non-GUI backends
# such as the inline one, and it matches the gender-comparison cell.
plt.show()
In [71]:
# Compare the mean film rankings given by male and female respondents.
gender = star_wars["Gender"]
males = star_wars[gender == "Male"]
females = star_wars[gender == "Female"]
male_ranking, female_ranking = (
    subset.iloc[:, 9:15].mean() for subset in (males, females)
)
# One column per gender, one row per film.
total_ranking = pd.concat([male_ranking, female_ranking], axis=1)
total_ranking.index = movie_names
total_ranking.plot(kind="bar")
# Legend labels follow the column order used in the concat above.
plt.legend(["male", "female"])
plt.show()
In [ ]: