In [11]:
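# Data source: FiveThirtyEight "college-majors" dataset (recent-grads.csv).
# Uncomment the two lines below to download the CSV into the working directory
# before the first run:
# %sh
# wget https://raw.githubusercontent.com/fivethirtyeight/data/master/college-majors/recent-grads.csv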
In [12]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load the recent-graduates CSV from the working directory, then preview
# the first three rows as the cell's displayed output.
recent_grads = pd.read_csv("recent-grads.csv")
recent_grads.head(n=3)
Out[12]:
In [13]:
# Preview the last three rows to see how the file ends.
recent_grads.tail(n=3)
Out[13]:
In [14]:
# Display pandas' default summary statistics for the loaded frame.
recent_grads.describe()
Out[14]:
In [15]:
# Remove rows that contain missing values and report the shape before/after.
n_rows, n_cols = recent_grads.shape
print("Original: {0} rows x {1} columns".format(n_rows, n_cols))
# Reassign instead of using `inplace=True`: pandas discourages in-place
# mutation, and rebinding keeps the frame displayed by earlier cells untouched.
recent_grads = recent_grads.dropna()
n_rows, n_cols = recent_grads.shape
print("After cleansed: {0} rows x {1} columns".format(n_rows, n_cols))
In [16]:
# Plot a scatter matrix of the share of women vs. the unemployment rate.
# `pandas.tools.plotting` was deprecated in pandas 0.20 and has since been
# removed; `scatter_matrix` now lives in `pandas.plotting`. Fall back to the
# old location only for legacy pandas installs.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix
scatter_matrix(recent_grads[["ShareWomen", "Unemployment_rate"]])
plt.show()
In [17]:
# Plot a stacked bar chart of the men/women share for each major in the
# "Arts" category.
# ShareMen is derived as the complement of ShareWomen.
# NOTE(review): recent_grads["Men"] / recent_grads["Total"] was considered as
# an alternative definition — presumably equivalent; verify before switching.
recent_grads["ShareMen"] = 1 - recent_grads["ShareWomen"]
# Chain set_index rather than mutating the filtered slice in place, so the
# cell produces `arts` in a single step.
arts = recent_grads[recent_grads["Major_category"] == "Arts"].set_index("Major")
sex_ratio = ["ShareMen", "ShareWomen"]
# y-axis ticks in steps of 0.2, starting at 0 and stopping below 1.4.
yticks = np.arange(0, 1.4, 0.2)
ax = arts[sex_ratio].plot(kind="bar", stacked=True, figsize=(6, 6), yticks=yticks)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_ylabel("Share of graduates")
ax.set_title("Share of men and women by Arts major")
plt.show()
In [ ]: