In [24]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [25]:
# Read the data set from the CSV file in the working directory.
recent_grads = pd.read_csv(filepath_or_buffer='recent-grads.csv')
# Display the first row as a Series to see one value per column.
recent_grads.iloc[0]
Out[25]:
In [26]:
# Preview the first five rows of the frame.
recent_grads.head(n=5)
Out[26]:
In [27]:
# Preview the last five rows of the frame.
recent_grads.tail(n=5)
Out[27]:
In [28]:
# Summary statistics of the data as loaded, before any rows are dropped.
recent_grads.describe()
Out[28]:
In [29]:
# Row count recorded before cleaning (hard-coded).
# NOTE(review): presumably the number of rows in the raw CSV — confirm, or
# derive it with len(recent_grads) before the dropna call below.
raw_data_count = 173
# Discard every row that contains at least one missing value.
recent_grads = recent_grads.dropna(axis=0, how='any')
# Summary statistics of the cleaned frame.
recent_grads.describe()
Out[29]:
In [30]:
# Scatter plot of Sample_size (x) against Median (y).
recent_grads.plot.scatter(x='Sample_size', y='Median')
Out[30]:
In [31]:
# Scatter plot of Sample_size (x) against Unemployment_rate (y).
recent_grads.plot.scatter(x='Sample_size', y='Unemployment_rate')
Out[31]:
In [32]:
# Scatter plot of Full_time (x) against Median (y).
recent_grads.plot.scatter(x='Full_time', y='Median')
Out[32]:
In [33]:
# Scatter plot of ShareWomen (x) against Unemployment_rate (y).
recent_grads.plot.scatter(x='ShareWomen', y='Unemployment_rate')
Out[33]:
In [34]:
# Scatter plot of Men (x) against Median (y).
recent_grads.plot.scatter(x='Men', y='Median')
Out[34]:
In [35]:
# Scatter plot of Women (x) against Median (y).
recent_grads.plot.scatter(x='Women', y='Median')
Out[35]:
In [36]:
##Do students in more popular majors make more money?
In [37]:
# Popularity question: scatter plot of Total (x) against Median (y).
recent_grads.plot.scatter(x='Total', y='Median')
Out[37]:
In [38]:
## As we can see, the answer is no. There is a weak trend: the more popular the major, the lower the median salary
In [39]:
##Do students that majored in subjects that were majority female make more money?
In [40]:
# Majority-female question: scatter plot of ShareWomen (x) against Median (y).
recent_grads.plot.scatter(x='ShareWomen', y='Median')
Out[40]:
In [41]:
## No, they don't. We can see a downward trend: the median salary goes down as the share of women goes up
In [42]:
##Is there any link between the number of full-time employees and median salary?
In [43]:
# Full-time question: scatter plot of Full_time (x) against Median (y).
recent_grads.plot.scatter(x='Full_time', y='Median')
Out[43]:
In [44]:
## No, we found no link between the number of full-time employees and the median salary
In [45]:
# Names of the columns explored in this notebook. They must be string
# literals: bare identifiers raise NameError because no variables with
# these names exist.
cols = ['Sample_size', 'Median', 'Employed', 'Full_time',
        'ShareWomen', 'Unemployment_rate', 'Men', 'Women']
In [ ]:
# Histogram of Sample_size: 50 bins restricted to the 0-1000 range.
recent_grads['Sample_size'].hist(range=(0, 1000), bins=50)
In [ ]:
# Histogram of Median: 20 bins on a tall 4x10 figure, x tick labels rotated 90 degrees.
recent_grads['Median'].hist(figsize=(4, 10), bins=20, xrot=90)
In [ ]:
# Histogram of Employed: 20 bins, x tick labels rotated 90 degrees.
recent_grads['Employed'].hist(xrot=90, bins=20)
In [ ]:
# Histogram of Full_time: 20 bins, x tick labels rotated 90 degrees.
recent_grads['Full_time'].hist(xrot=90, bins=20)
In [ ]:
# Histogram of ShareWomen: 15 bins, x tick labels rotated 90 degrees.
recent_grads['ShareWomen'].hist(xrot=90, bins=15)
In [ ]:
# Histogram of Unemployment_rate: 15 bins, x tick labels rotated 90 degrees.
recent_grads['Unemployment_rate'].hist(xrot=90, bins=15)
In [ ]:
# Histogram of Men: 10 bins, x tick labels rotated 90 degrees.
recent_grads['Men'].hist(xrot=90, bins=10)
In [ ]:
# Histogram of Women: 10 bins, x tick labels rotated 90 degrees.
recent_grads['Women'].hist(xrot=90, bins=10)
In [ ]:
# scatter_matrix lives in pandas.plotting; the old pandas.tools.plotting
# module has been removed from pandas, so importing from it fails.
from pandas.plotting import scatter_matrix

# Pairwise scatter plots (with per-column histograms on the diagonal)
# for Sample_size and Median.
scatter_matrix(recent_grads[['Sample_size', 'Median']], figsize=(10, 10))
In [ ]:
# Scatter matrix extended with Unemployment_rate as a third column.
scatter_matrix(
    recent_grads[['Sample_size', 'Median', 'Unemployment_rate']],
    figsize=(10, 10),
)
In [ ]:
# ShareWomen per major for the first ten and the last ten rows of the frame.
# NOTE(review): the titles assume the rows are ordered by salary — confirm.
recent_grads.head(10).plot.bar(x='Major', y='ShareWomen', legend=False, title='Top 10 salary and percent of women')
recent_grads.tail(10).plot.bar(x='Major', y='ShareWomen', legend=False, title='Low 10 salary and percent of women')
In [ ]:
# Unemployment_rate per major for the first ten and the last ten rows of the frame.
recent_grads.head(10).plot.bar(x='Major', y='Unemployment_rate', legend=False, title='Unemployment rate in top 10')
recent_grads.tail(10).plot.bar(x='Major', y='Unemployment_rate', legend=False, title='Unemployment rate in end 10')
In [ ]:
## Use a grouped bar plot to compare the number of men with the number of women in each category of majors
# Sum Men and Women per major category, then draw the two totals side by side
# (grouped, not stacked) with the category names as bar labels.
# NOTE(review): assumes the CSV has a 'Major_category' column, as in the
# FiveThirtyEight recent-grads data set — confirm.
men_women_by_category = recent_grads.groupby('Major_category')[['Men', 'Women']].sum()
men_women_by_category.plot.barh(figsize=(15, 10), title='Men and women by major category')
In [49]:
## Box plots of the median salary and unemployment rate distributions.
# Vertical box plot of the Median column.
recent_grads['Median'].plot(kind='box')
Out[49]:
In [58]:
# Horizontal box plot of the Unemployment_rate column.
recent_grads['Unemployment_rate'].plot(kind='box', vert=False)
Out[58]:
In [59]:
## Hexagonal bin plot for a column pair whose earlier scatter plot was dense.
# Point density of Sample_size (x) against Unemployment_rate (y).
recent_grads.plot.hexbin(x='Sample_size', y='Unemployment_rate')
Out[59]:
In [ ]: