In [18]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from pandas.plotting import scatter_matrix
%matplotlib inline

In [2]:
recent_grads = pd.read_csv('recent-grads.csv')
print(recent_grads.iloc[0])
print(recent_grads.head())
print(recent_grads.tail())
print(recent_grads.describe())

print(recent_grads[['Major','Total']])


Rank                                        1
Major_code                               2419
Major                   PETROLEUM ENGINEERING
Total                                    2339
Men                                      2057
Women                                     282
Major_category                    Engineering
ShareWomen                           0.120564
Sample_size                                36
Employed                                 1976
Full_time                                1849
Part_time                                 270
Full_time_year_round                     1207
Unemployed                                 37
Unemployment_rate                   0.0183805
Median                                 110000
P25th                                   95000
P75th                                  125000
College_jobs                             1534
Non_college_jobs                          364
Low_wage_jobs                             193
Name: 0, dtype: object
   Rank  Major_code                                      Major    Total  \
0     1        2419                      PETROLEUM ENGINEERING   2339.0   
1     2        2416             MINING AND MINERAL ENGINEERING    756.0   
2     3        2415                  METALLURGICAL ENGINEERING    856.0   
3     4        2417  NAVAL ARCHITECTURE AND MARINE ENGINEERING   1258.0   
4     5        2405                       CHEMICAL ENGINEERING  32260.0   

       Men    Women Major_category  ShareWomen  Sample_size  Employed  \
0   2057.0    282.0    Engineering    0.120564           36      1976   
1    679.0     77.0    Engineering    0.101852            7       640   
2    725.0    131.0    Engineering    0.153037            3       648   
3   1123.0    135.0    Engineering    0.107313           16       758   
4  21239.0  11021.0    Engineering    0.341631          289     25694   

       ...        Part_time  Full_time_year_round  Unemployed  \
0      ...              270                  1207          37   
1      ...              170                   388          85   
2      ...              133                   340          16   
3      ...              150                   692          40   
4      ...             5180                 16697        1672   

   Unemployment_rate  Median  P25th   P75th  College_jobs  Non_college_jobs  \
0           0.018381  110000  95000  125000          1534               364   
1           0.117241   75000  55000   90000           350               257   
2           0.024096   73000  50000  105000           456               176   
3           0.050125   70000  43000   80000           529               102   
4           0.061098   65000  50000   75000         18314              4440   

   Low_wage_jobs  
0            193  
1             50  
2              0  
3              0  
4            972  

[5 rows x 21 columns]
     Rank  Major_code                   Major   Total     Men   Women  \
168   169        3609                 ZOOLOGY  8409.0  3050.0  5359.0   
169   170        5201  EDUCATIONAL PSYCHOLOGY  2854.0   522.0  2332.0   
170   171        5202     CLINICAL PSYCHOLOGY  2838.0   568.0  2270.0   
171   172        5203   COUNSELING PSYCHOLOGY  4626.0   931.0  3695.0   
172   173        3501         LIBRARY SCIENCE  1098.0   134.0   964.0   

               Major_category  ShareWomen  Sample_size  Employed  \
168    Biology & Life Science    0.637293           47      6259   
169  Psychology & Social Work    0.817099            7      2125   
170  Psychology & Social Work    0.799859           13      2101   
171  Psychology & Social Work    0.798746           21      3777   
172                 Education    0.877960            2       742   

         ...        Part_time  Full_time_year_round  Unemployed  \
168      ...             2190                  3602         304   
169      ...              572                  1211         148   
170      ...              648                  1293         368   
171      ...              965                  2738         214   
172      ...              237                   410          87   

     Unemployment_rate  Median  P25th  P75th  College_jobs  Non_college_jobs  \
168           0.046320   26000  20000  39000          2771              2947   
169           0.065112   25000  24000  34000          1488               615   
170           0.149048   25000  25000  40000           986               870   
171           0.053621   23400  19200  26000          2403              1245   
172           0.104946   22000  20000  22000           288               338   

     Low_wage_jobs  
168            743  
169             82  
170            622  
171            308  
172            192  

[5 rows x 21 columns]
             Rank   Major_code          Total            Men          Women  \
count  173.000000   173.000000     172.000000     172.000000     172.000000   
mean    87.000000  3879.815029   39370.081395   16723.406977   22646.674419   
std     50.084928  1687.753140   63483.491009   28122.433474   41057.330740   
min      1.000000  1100.000000     124.000000     119.000000       0.000000   
25%     44.000000  2403.000000    4549.750000    2177.500000    1778.250000   
50%     87.000000  3608.000000   15104.000000    5434.000000    8386.500000   
75%    130.000000  5503.000000   38909.750000   14631.000000   22553.750000   
max    173.000000  6403.000000  393735.000000  173809.000000  307087.000000   

       ShareWomen  Sample_size       Employed      Full_time      Part_time  \
count  172.000000   173.000000     173.000000     173.000000     173.000000   
mean     0.522223   356.080925   31192.763006   26029.306358    8832.398844   
std      0.231205   618.361022   50675.002241   42869.655092   14648.179473   
min      0.000000     2.000000       0.000000     111.000000       0.000000   
25%      0.336026    39.000000    3608.000000    3154.000000    1030.000000   
50%      0.534024   130.000000   11797.000000   10048.000000    3299.000000   
75%      0.703299   338.000000   31433.000000   25147.000000    9948.000000   
max      0.968954  4212.000000  307933.000000  251540.000000  115172.000000   

       Full_time_year_round    Unemployed  Unemployment_rate         Median  \
count            173.000000    173.000000         173.000000     173.000000   
mean           19694.427746   2416.329480           0.068191   40151.445087   
std            33160.941514   4112.803148           0.030331   11470.181802   
min              111.000000      0.000000           0.000000   22000.000000   
25%             2453.000000    304.000000           0.050306   33000.000000   
50%             7413.000000    893.000000           0.067961   36000.000000   
75%            16891.000000   2393.000000           0.087557   45000.000000   
max           199897.000000  28169.000000           0.177226  110000.000000   

              P25th          P75th   College_jobs  Non_college_jobs  \
count    173.000000     173.000000     173.000000        173.000000   
mean   29501.445087   51494.219653   12322.635838      13284.497110   
std     9166.005235   14906.279740   21299.868863      23789.655363   
min    18500.000000   22000.000000       0.000000          0.000000   
25%    24000.000000   42000.000000    1675.000000       1591.000000   
50%    27000.000000   47000.000000    4390.000000       4595.000000   
75%    33000.000000   60000.000000   14444.000000      11783.000000   
max    95000.000000  125000.000000  151643.000000     148395.000000   

       Low_wage_jobs  
count     173.000000  
mean     3859.017341  
std      6944.998579  
min         0.000000  
25%       340.000000  
50%      1231.000000  
75%      3466.000000  
max     48207.000000  
                                             Major     Total
0                            PETROLEUM ENGINEERING    2339.0
1                   MINING AND MINERAL ENGINEERING     756.0
2                        METALLURGICAL ENGINEERING     856.0
3        NAVAL ARCHITECTURE AND MARINE ENGINEERING    1258.0
4                             CHEMICAL ENGINEERING   32260.0
5                              NUCLEAR ENGINEERING    2573.0
6                                ACTUARIAL SCIENCE    3777.0
7                       ASTRONOMY AND ASTROPHYSICS    1792.0
8                           MECHANICAL ENGINEERING   91227.0
9                           ELECTRICAL ENGINEERING   81527.0
10                            COMPUTER ENGINEERING   41542.0
11                           AEROSPACE ENGINEERING   15058.0
12                          BIOMEDICAL ENGINEERING   14955.0
13                               MATERIALS SCIENCE    4279.0
14       ENGINEERING MECHANICS PHYSICS AND SCIENCE    4321.0
15                          BIOLOGICAL ENGINEERING    8925.0
16        INDUSTRIAL AND MANUFACTURING ENGINEERING   18968.0
17                             GENERAL ENGINEERING   61152.0
18                       ARCHITECTURAL ENGINEERING    2825.0
19                                 COURT REPORTING    1148.0
20                                COMPUTER SCIENCE  128319.0
21                                    FOOD SCIENCE       NaN
22               ELECTRICAL ENGINEERING TECHNOLOGY   11565.0
23     MATERIALS ENGINEERING AND MATERIALS SCIENCE    2993.0
24   MANAGEMENT INFORMATION SYSTEMS AND STATISTICS   18713.0
25                               CIVIL ENGINEERING   53153.0
26                           CONSTRUCTION SERVICES   18498.0
27             OPERATIONS LOGISTICS AND E-COMMERCE   11732.0
28                       MISCELLANEOUS ENGINEERING    9133.0
29                                   PUBLIC POLICY    5978.0
..                                             ...       ...
143                     PLANT SCIENCE AND AGRONOMY    7416.0
144         SCIENCE AND COMPUTER TEACHER EDUCATION    6483.0
145                                     PSYCHOLOGY  393735.0
146                                          MUSIC   60633.0
147         PHYSICAL AND HEALTH EDUCATION TEACHING   28213.0
148                      ART HISTORY AND CRITICISM   21030.0
149                                      FINE ARTS   74440.0
150                   FAMILY AND CONSUMER SCIENCES   58001.0
151                                    SOCIAL WORK   53552.0
152                                ANIMAL SCIENCES   21573.0
153                     VISUAL AND PERFORMING ARTS   16250.0
154             TEACHER EDUCATION: MULTIPLE LEVELS   14443.0
155                       MISCELLANEOUS PSYCHOLOGY    9628.0
156      HUMAN SERVICES AND COMMUNITY ORGANIZATION    9374.0
157                                     HUMANITIES    6652.0
158               THEOLOGY AND RELIGIOUS VOCATIONS   30207.0
159                                    STUDIO ARTS   16977.0
160         COSMETOLOGY SERVICES AND CULINARY ARTS   10510.0
161                      MISCELLANEOUS AGRICULTURE    1488.0
162                    ANTHROPOLOGY AND ARCHEOLOGY   38844.0
163  COMMUNICATION DISORDERS SCIENCES AND SERVICES   38279.0
164                      EARLY CHILDHOOD EDUCATION   37589.0
165                        OTHER FOREIGN LANGUAGES   11204.0
166                         DRAMA AND THEATER ARTS   43249.0
167                       COMPOSITION AND RHETORIC   18953.0
168                                        ZOOLOGY    8409.0
169                         EDUCATIONAL PSYCHOLOGY    2854.0
170                            CLINICAL PSYCHOLOGY    2838.0
171                          COUNSELING PSYCHOLOGY    4626.0
172                                LIBRARY SCIENCE    1098.0

[173 rows x 2 columns]

In [3]:
#num of raw data rows
raw_data_cnt = len(recent_grads.values)
print(raw_data_cnt)

recent_grads.dropna(axis=0, how='any', inplace=True)
cleaned_data_count = len(recent_grads.values)
print(cleaned_data_count)

#we dropped one row(exactly one row contained NaN value)


173
172

In [4]:
#this is very good example of positive correlation
recent_grads.plot(x='Sample_size', y='Employed', kind='scatter')


Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fea90607f28>

In [5]:
#no correlation
recent_grads.plot(x='Sample_size', y='Median', kind='scatter', title='Median vs Sample size')


Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fea8e5738d0>

In [6]:
#by observing plot in this cell we can say there is very weak negative correlation
#as full_time employee count grows the median salary tends to drop
P05_median = recent_grads['Median'].quantile(.05)
P85_median = recent_grads['Median'].quantile(.75)
print(P85_median)

recent_grads.plot(x='Full_time', y='Median', kind='scatter', title='Median vs Full time')


45000.0
Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fea8e4ee630>

In [7]:
fig = plt.figure(figsize=(8,8))
'''
take ranges for men/women count per major up to percentile of 85
i did this cuz it dramatically reduces range of x-axis and it excludes only 15% of the vector
reducing range makes plot 'zoomed' and easier to understand correlation in cases where majority of population is in
much lower range than highest 10-15%
'''
men_xlimrange = (0,recent_grads['Men'].quantile(.85))
women_xlimrange =(0,recent_grads['Women'].quantile(.85))


men_median = fig.add_subplot(2,1,1)
men_median.scatter(x=recent_grads['Men'], y=recent_grads['Median'])
men_median.set_xlim(men_xlimrange)
men_median.set_xlabel('Men count per major')
men_median.set_ylabel('Median of year salaries per major')
women_median = fig.add_subplot(2,1,2)
women_median.scatter(x=recent_grads['Women'], y=recent_grads['Median'])
women_median.set_xlabel('Women count per major')
women_median.set_ylabel('Median of year salaries per major')
women_median.set_xlim(women_xlimrange)


'''
by observing plots below i'd say that when it comes to majors where majority is women are making less money
but correlation between median and women is very very very weak(if exists at all) 
and tends to drop median while women count increases
still majors which have low count of women and men generate most money
majors which have low value of total people but generally more men produces more money
'''


Out[7]:
"\nby observing plots below i'd say that when it comes to majors where majority is women are making less money\nbut correlation between median and women is very very very weak(if exists at all) \nand tends to drop median while women count increases\nstill majors which have low count of women and men generate most money\nmajors which have low value of total people but generally more men produces more money\n"

In [63]:
'''
if we consider majors popularity by total column and say some major brings more money than other major 
if median of some major > median of other major then we can say by scatter plot below 
that there is very weak negative correlation between these two
conclusion is that there are few majors that have <10000 students and they have higher medians
so majors that have more students do not bring more money as well
also i limited x axis from 500 people to 60000 since this is were 80% of the majors fall and it is much easier
to observe the correlation

'''
recent_grads.plot(x='Total', y='Median', kind='scatter', xlim=(500,60000))

print(recent_grads['Total'].quantile([.1,.5,.75,.8,.9,1]))


0.10      1991.80
0.50     15104.00
0.75     38909.75
0.80     53087.20
0.90    124109.90
1.00    393735.00
Name: Total, dtype: float64

In [9]:
#lets find majors which have more femals than males
more_females = recent_grads['Women'] > recent_grads['Men']
more_females_df = recent_grads[more_females]
print(more_females_df[:1])
more_femals_money_median = more_females_df['Median']


#idk why i did this, although won't remove it


   Rank  Major_code                       Major   Total    Men  Women  \
7     8        5001  ASTRONOMY AND ASTROPHYSICS  1792.0  832.0  960.0   

      Major_category  ShareWomen  Sample_size  Employed      ...        \
7  Physical Sciences    0.535714           10      1526      ...         

   Part_time  Full_time_year_round  Unemployed  Unemployment_rate  Median  \
7        553                   827          33           0.021167   62000   

   P25th   P75th  College_jobs  Non_college_jobs  Low_wage_jobs  
7  31500  109000           972               500            220  

[1 rows x 21 columns]

In [10]:
n, bins, pathces = plt.hist(recent_grads['Median'], bins=7, edgecolor='black')
plt.xticks(np.arange(20000,120000,5000), rotation=90)
print(n)
print(sum(n))
print(bins)
#most common median salary range is 34000-47000 with count of 78


[ 60.  78.  20.  11.   2.   0.   1.]
172.0
[  22000.           34571.42857143   47142.85714286   59714.28571429
   72285.71428571   84857.14285714   97428.57142857  110000.        ]

In [11]:
recent_grads_ten = recent_grads[0:10].copy()
print(recent_grads_ten[['Total','Men','Women', 'ShareWomen']])
#90% of majors have more men in this small df
#10% of majors have more women in this small df

#lets plot this too see how histogram shows it
recent_grads_ten['ShareMen'] = recent_grads_ten['Men'] / recent_grads_ten['Total']
cols = ['ShareWomen', 'ShareMen']

fig = plt.figure(figsize=(10,7))

for i in range(len(cols)):
    ax = fig.add_subplot(2,1,i+1)
    ax.set_xlabel(cols[i])
    n, bins, patches = ax.hist(recent_grads_ten[cols[i]], edgecolor='black')
    print(n)
    print(bins)
'''
Well what we wrote above in comments seems like it is true after obesrving histograms
'''


     Total      Men    Women  ShareWomen
0   2339.0   2057.0    282.0    0.120564
1    756.0    679.0     77.0    0.101852
2    856.0    725.0    131.0    0.153037
3   1258.0   1123.0    135.0    0.107313
4  32260.0  21239.0  11021.0    0.341631
5   2573.0   2200.0    373.0    0.144967
6   3777.0   2110.0   1667.0    0.441356
7   1792.0    832.0    960.0    0.535714
8  91227.0  80320.0  10907.0    0.119559
9  81527.0  65511.0  16016.0    0.196450
[ 5.  1.  1.  0.  0.  1.  0.  1.  0.  1.]
[ 0.10185185  0.1452381   0.18862434  0.23201058  0.27539683  0.31878307
  0.36216931  0.40555556  0.4489418   0.49232804  0.53571429]
[ 1.  0.  1.  0.  1.  0.  0.  1.  1.  5.]
[ 0.46428571  0.50767196  0.5510582   0.59444444  0.63783069  0.68121693
  0.72460317  0.76798942  0.81137566  0.8547619   0.89814815]
Out[11]:
'\nWell what we wrote above in comments seems like it is true after obesrving histograms\n'

In [35]:
'''
OK we have found answers on 'What percent of majors are predominantly male? Predominantly female?' for first 10 rows
in data set, seen how to observe it and make conclusion from both visualizing and just watching the raw data
(we could watch the raw data itself cuz there was just 10 rows ofc)

Since we cannot tell the answer for the whole data-set by just observing raw-data we will plot it on histogram
and make a conclusion
'''
fig,ax = plt.subplots()
n , bins, patches = ax.hist(recent_grads['ShareWomen'], edgecolor='black')
print(n)
print(bins)

'''
by observing plot below i would say that there are about 42% majors that have more men
and 58% for women
Let's make some calculations to proove this
'''

more_men = recent_grads['Men'] > recent_grads['Women']
more_men_cnt = more_men.value_counts().loc[True]
more_men_majors_percentage = (more_men_cnt / len(recent_grads))*100
print(more_men_majors_percentage)

'''
I was accurate, real percentage for majors that men dominate is 44%, by just observing histogram 
along side with it' frequency list and bins list i came up with ~42% answer!


Let's fetch now percentage for majors where women dominate
'''

more_women_majors_percentage = 100-more_men_majors_percentage
print(more_women_majors_percentage)


[  3.  14.  16.  22.  19.  21.  25.  29.  11.  12.]
[ 0.          0.09689537  0.19379074  0.2906861   0.38758147  0.48447684
  0.58137221  0.67826758  0.77516295  0.87205831  0.96895368]
44.1860465116
55.8139534884

In [23]:
scatter_matrix(recent_grads[['Median','Sample_size']], figsize=(10,10))
'''
we can see as sample_size grows median usually tends to be lower
which means as bigger the sample is for a given major it's median year rounded salary tends to be lower
'''


Out[23]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fea87794438>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fea87752630>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7fea877619b0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fea87698780>]], dtype=object)

In [36]:
scatter_matrix(recent_grads[['Sample_size', 'Median', 'Unemployment_rate']], figsize=(15,10))


Out[36]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fea86e16438>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fea86c4dfd0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fea87036f60>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7fea86b95940>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fea86b75ac8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fea86b75b00>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7fea86a580f0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fea86a154e0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fea8699fa20>]], dtype=object)

In [71]:
'''
df is already sorted by rank.
Rank displays ranking for the major based on it's median earnings
'''
recent_grads[0:10].plot.bar(x=['Major','Median'], y='ShareWomen')


Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fea84a00d68>

In [70]:
recent_grads[-10:].plot.bar(x=['Major','Median'], y='ShareWomen')


Out[70]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fea84bb7cc0>

In [69]:
'''
    By observing two plots above we can say that there is low percentage of women in highest paid majors
    and high percentage of women in lowest paid majors
    So i was curious then to see the correlation between 
    Median( represents median for the year rounded salary in the major) and
    ShareWomen ( represents percentage of the women in the major)


    By observing plot below we can see that there is strong negative correlation between the two
    As the share of women in major drops median tends to be higher
'''
recent_grads.plot(x='Median', y='ShareWomen', kind='scatter')


Out[69]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fea84c784a8>

In [67]:
recent_grads[:10].plot.bar(x=['Major', 'Median'], y='Unemployment_rate')


Out[67]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fea84d06390>

In [75]:
recent_grads[-10:].plot.bar(x=['Major','Median'], y='Unemployment_rate')


Out[75]:
' \nBy observing two plots above we can say that generally unemp\nrate is higher in the most poor  majors than in the most paid majors\n '

In [76]:
''' 
By observing two plots above we can say that generally unemp
rate is higher in the most poor  majors than in the most paid majors
 '''


Out[76]:
' \nBy observing two plots above we can say that generally unemp\nrate is higher in the most poor  majors than in the most paid majors\n '