In [18]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from pandas.plotting import scatter_matrix
%matplotlib inline
In [2]:
recent_grads = pd.read_csv('recent-grads.csv')

# First look at the table: the first record, both ends of the frame,
# summary statistics, and the size of every major.
first_looks = (
    recent_grads.iloc[0],
    recent_grads.head(),
    recent_grads.tail(),
    recent_grads.describe(),
    recent_grads[['Major', 'Total']],
)
for view in first_looks:
    print(view)
In [3]:
# Number of rows before cleaning. len() on the frame counts rows directly,
# without first converting the whole table to a NumPy array via `.values`.
raw_data_cnt = len(recent_grads)
print(raw_data_cnt)

# Drop every row that holds at least one missing value. The result is bound
# back to the same name (instead of inplace=True) so the cells below keep
# working on `recent_grads` unchanged.
recent_grads = recent_grads.dropna(axis=0, how='any')

cleaned_data_count = len(recent_grads)
print(cleaned_data_count)
# Author's observation: one row was dropped (exactly one row contained a NaN value).
In [4]:
# Author's reading of this scatter: a clear example of positive correlation.
recent_grads.plot.scatter(x='Sample_size', y='Employed')
Out[4]:
In [5]:
# Author's reading of this scatter: no correlation.
recent_grads.plot.scatter(
    x='Sample_size',
    y='Median',
    title='Median vs Sample size',
)
Out[5]:
In [6]:
# Author's reading of the scatter in this cell: a very weak negative
# correlation -- as the full-time employee count grows, the median salary
# tends to drop.
P05_median = recent_grads['Median'].quantile(.05)
# 85th percentile of Median, as the variable name states.
P85_median = recent_grads['Median'].quantile(.85)
print(P85_median)
recent_grads.plot(x='Full_time', y='Median', kind='scatter', title='Median vs Full time')
Out[6]:
In [7]:
fig = plt.figure(figsize=(8, 8))

# Each x-axis is capped at the 85th percentile of its column. That leaves out
# only the top 15% of the values while shrinking the axis range a lot, so the
# plot is "zoomed" onto the range where most majors sit and any correlation
# is easier to see.
men_xlimrange = (0, recent_grads['Men'].quantile(.85))
women_xlimrange = (0, recent_grads['Women'].quantile(.85))

# Two stacked panels: men on top, women below.
men_median = fig.add_subplot(2, 1, 1)
women_median = fig.add_subplot(2, 1, 2)

panels = [
    (men_median, 'Men', men_xlimrange, 'Men count per major'),
    (women_median, 'Women', women_xlimrange, 'Women count per major'),
]
for axes, column, x_range, x_label in panels:
    axes.scatter(x=recent_grads[column], y=recent_grads['Median'])
    axes.set_xlim(x_range)
    axes.set_xlabel(x_label)
    axes.set_ylabel('Median of year salaries per major')
'''
by observing plots below i'd say that when it comes to majors where majority is women are making less money
but correlation between median and women is very very very weak(if exists at all)
and tends to drop median while women count increases
still majors which have low count of women and men generate most money
majors which have low value of total people but generally more men produces more money
'''
Out[7]:
In [63]:
# Popularity of a major is taken from the Total column, and one major is said
# to "bring more money" than another when its Median is higher.
# Author's reading of the scatter below: a very weak negative correlation
# between the two. A few majors with fewer than 10000 students have the higher
# medians, so majors with more students do not bring more money.
# The x-axis is limited to 500..60000 people because, per the author, that is
# where about 80% of the majors fall, which makes the trend easier to observe.
recent_grads.plot.scatter(x='Total', y='Median', xlim=(500, 60000))

total_quantiles = recent_grads['Total'].quantile([.1, .5, .75, .8, .9, 1])
print(total_quantiles)
In [9]:
# Find the majors that have more females than males.
more_females = recent_grads['Women'].gt(recent_grads['Men'])
more_females_df = recent_grads.loc[more_females]
print(more_females_df.head(1))

# Median salaries of those majors (not used further in this notebook;
# the author chose to keep it).
more_femals_money_median = more_females_df['Median']
In [10]:
# Histogram of median salaries in 7 bins; keep the counts and the bin edges.
n, bins, patches = plt.hist(recent_grads['Median'], bins=7, edgecolor='black')

# Tick every 5000 from 20000 up to (not including) 120000, labels rotated.
salary_ticks = np.arange(20000, 120000, 5000)
plt.xticks(salary_ticks, rotation=90)

for printed in (n, sum(n), bins):
    print(printed)
# Author's reading of the output: the most common median salary range is
# 34000-47000, with a count of 78.
In [11]:
# Work on an independent copy of the first ten rows.
recent_grads_ten = recent_grads.iloc[:10].copy()
print(recent_grads_ten[['Total', 'Men', 'Women', 'ShareWomen']])
# Author's reading of this small frame: 90% of the majors have more men and
# 10% have more women. The histograms below show how a plot reflects that.
recent_grads_ten['ShareMen'] = recent_grads_ten['Men'] / recent_grads_ten['Total']

cols = ['ShareWomen', 'ShareMen']
fig = plt.figure(figsize=(10, 7))
# One histogram per share column, stacked vertically; the bin counts and
# bin edges of each are printed.
for position, share_col in enumerate(cols, start=1):
    ax = fig.add_subplot(2, 1, position)
    ax.set_xlabel(share_col)
    n, bins, patches = ax.hist(recent_grads_ten[share_col], edgecolor='black')
    print(n)
    print(bins)
'''
Well what we wrote above in comments seems like it is true after obesrving histograms
'''
Out[11]:
In [35]:
# The first ten rows answered "What percent of majors are predominantly male?
# Predominantly female?" both by plotting and by reading the raw values, which
# was only feasible because there were just ten rows. For the whole data set
# the raw values cannot be read by eye, so the share of women is plotted as a
# histogram and the conclusion is drawn from that.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(recent_grads['ShareWomen'], edgecolor='black')
print(n)
print(bins)

# Author's estimate read off the histogram: about 42% of majors have more men
# and about 58% have more women. The calculation below checks that estimate.
more_men = recent_grads['Men'] > recent_grads['Women']
# Summing a boolean mask counts its True entries and gives 0 when there are
# none, where value_counts().loc[True] would raise a KeyError.
more_men_cnt = more_men.sum()
more_men_majors_percentage = (more_men_cnt / len(recent_grads)) * 100
print(more_men_majors_percentage)

# Author's note: the computed figure for men-dominated majors was 44%, close
# to the ~42% estimated from the histogram's frequencies and bin edges.
# The women-dominated share is measured directly. Taking 100 minus the men's
# percentage would also count majors where Men == Women as women-dominated.
more_women = recent_grads['Women'] > recent_grads['Men']
more_women_majors_percentage = more_women.mean() * 100
print(more_women_majors_percentage)
In [23]:
# Pairwise scatter plots (with per-column distributions on the diagonal)
# for Median and Sample_size.
pair_columns = ['Median', 'Sample_size']
scatter_matrix(recent_grads[pair_columns], figsize=(10, 10))
'''
we can see as sample_size grows median usually tends to be lower
which means as bigger the sample is for a given major it's median year rounded salary tends to be lower
'''
Out[23]:
In [36]:
# Same pairwise view, now including the unemployment rate.
matrix_columns = ['Sample_size', 'Median', 'Unemployment_rate']
scatter_matrix(recent_grads[matrix_columns], figsize=(15, 10))
Out[36]:
In [71]:
# Per the author, the frame is already sorted by rank, and Rank orders the
# majors by their median earnings -- so the first ten rows are the top ten.
# `x` in DataFrame.plot must be a single label or position; a list of columns
# is rejected by current pandas ("x must be a label or position"). Indexing by
# both columns first gives the same (Major, Median) tick labels.
recent_grads[0:10].set_index(['Major', 'Median']).plot.bar(y='ShareWomen')
Out[71]:
In [70]:
# Share of women in the last ten rows of the frame.
# `x` in DataFrame.plot must be a single label or position; a list of columns
# is rejected by current pandas ("x must be a label or position"). Indexing by
# both columns first gives the same (Major, Median) tick labels.
recent_grads[-10:].set_index(['Major', 'Median']).plot.bar(y='ShareWomen')
Out[70]:
In [69]:
# Author's reading of the two bar plots above: the share of women is low in
# the highest-paid majors and high in the lowest-paid majors.
# That prompted a look at the correlation between
#   Median     - median of the year salary in the major, and
#   ShareWomen - share of women in the major.
# Author's reading of the scatter below: a strong negative correlation -- as
# the share of women in a major drops, the median tends to be higher.
recent_grads.plot.scatter(x='Median', y='ShareWomen')
Out[69]:
In [67]:
# Unemployment rate in the first ten rows of the frame.
# `x` in DataFrame.plot must be a single label or position; a list of columns
# is rejected by current pandas ("x must be a label or position"). Indexing by
# both columns first gives the same (Major, Median) tick labels.
recent_grads[:10].set_index(['Major', 'Median']).plot.bar(y='Unemployment_rate')
Out[67]:
In [75]:
# Unemployment rate in the last ten rows of the frame.
# `x` in DataFrame.plot must be a single label or position; a list of columns
# is rejected by current pandas ("x must be a label or position"). Indexing by
# both columns first gives the same (Major, Median) tick labels.
recent_grads[-10:].set_index(['Major', 'Median']).plot.bar(y='Unemployment_rate')
Out[75]:
In [76]:
'''
By observing the two plots above we can say that, in general, the unemployment
rate is higher in the lowest-paid majors than in the highest-paid majors
'''
Out[76]: