notebook.community

Edit and run



In [1]:

    
import warnings

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import normalize

pd.set_option('display.max_columns', 1800)
plt.style.use('ggplot')
warnings.filterwarnings('ignore')

%matplotlib inline



In [2]:

    
mpl.rc('savefig', dpi=200)
params = {'figure.dpi' : 200,
          'figure.figsize' : (12, 10),
          'axes.axisbelow' : True,
          'lines.antialiased' : True,
          'axes.titlesize' : 'xx-large',
          'axes.labelsize' : 'x-large',
          'xtick.labelsize' : 'large',
          'ytick.labelsize' : 'large'}

for (k, v) in params.items():
    plt.rcParams[k] = v



In [3]:

    
data = pd.DataFrame()
files = ['data/MERGED2003_PP.csv', 'data/MERGED2005_PP.csv',
         'data/MERGED2007_PP.csv']
dfs = []
for file in files:
    df = pd.read_csv(file, low_memory=False)
    print(df.shape)
    df['type'] = 'training'
    dfs.append(df)

data = pd.concat(dfs)









    



(6585, 1729)
(6824, 1729)
(6890, 1729)



In [4]:

    
testing = pd.read_csv('data/MERGED2011_PP.csv', low_memory=False)
print(testing.shape)
testing['type'] = 'testing'









    



(7675, 1729)



In [5]:

    
data = pd.concat([data, testing])



In [6]:

    
cols_target = ['md_earn_wne_p6']

cols_school = ['PREDDEG', 'HIGHDEG', 'CONTROL', 'NUMBRANCH', 'AVGFACSAL']
#                'PCIP01', 'PCIP03', 'PCIP04', 'PCIP05', 'PCIP09', 'PCIP10',
#                'PCIP11', 'PCIP12', 'PCIP13', 'PCIP14', 'PCIP15', 'PCIP16',
#                'PCIP19', 'PCIP22', 'PCIP23', 'PCIP24', 'PCIP25', 'PCIP26',
#                'PCIP27', 'PCIP29', 'PCIP30', 'PCIP31', 'PCIP38', 'PCIP39',
#                'PCIP40', 'PCIP41', 'PCIP42', 'PCIP43', 'PCIP44', 'PCIP45',
#                'PCIP46', 'PCIP47', 'PCIP48', 'PCIP49', 'PCIP50', 'PCIP51',
#                'PCIP52', 'PCIP54']

# cols_admissions = ['ADM_RATE', 'SATVR25', 'SATVR75', 'SATMT25', 'SATMT75', 'SAT_AVG']

cols_costs = ['TUITFTE']

cols_studentbody = ['UGDS', 'UGDS_NRA', 'PPTUG_EF', 'UG25abv',
                    'PAR_ED_PCT_1STGEN', 'DEP_INC_AVG', 'IND_INC_AVG',
                    'COMP_ORIG_YR2_RT', 'WDRAW_ORIG_YR2_RT', 'ENRL_ORIG_YR2_RT',
                    'COMP_ORIG_YR4_RT', 'WDRAW_ORIG_YR4_RT', 'ENRL_ORIG_YR4_RT',
                    'OVERALL_YR2_N', 'OVERALL_YR3_N', 'OVERALL_YR4_N',
                    'OVERALL_YR6_N', 'OVERALL_YR8_N', 'count_nwne_p6']

cols_financialaid = ['DEBT_MDN', 'GRAD_DEBT_MDN', 'WDRAW_DEBT_MDN']

cols_other = ['type']



In [7]:

    
data.shape









    Out[7]:





(27974, 1730)



In [8]:

    
data_reduced = data[cols_target+cols_school+cols_costs+\
                    cols_studentbody+cols_financialaid+cols_other]



In [9]:

    
for c in data_reduced.columns:
    if (data_reduced[c].dtype == object) and (c != 'type'):
        data_reduced[c] = data_reduced[c].apply(lambda x: float(x) if x != 'PrivacySuppressed' else np.nan)
        data_reduced[c] = data_reduced[c].astype(float)



In [10]:

    
data_reduced.describe()









    Out[10]:






  
    
      
      md_earn_wne_p6
      PREDDEG
      HIGHDEG
      CONTROL
      NUMBRANCH
      AVGFACSAL
      TUITFTE
      UGDS
      UGDS_NRA
      PPTUG_EF
      UG25abv
      PAR_ED_PCT_1STGEN
      DEP_INC_AVG
      IND_INC_AVG
      COMP_ORIG_YR2_RT
      WDRAW_ORIG_YR2_RT
      ENRL_ORIG_YR2_RT
      COMP_ORIG_YR4_RT
      WDRAW_ORIG_YR4_RT
      ENRL_ORIG_YR4_RT
      OVERALL_YR2_N
      OVERALL_YR3_N
      OVERALL_YR4_N
      OVERALL_YR6_N
      OVERALL_YR8_N
      count_nwne_p6
      DEBT_MDN
      GRAD_DEBT_MDN
      WDRAW_DEBT_MDN
    
  
  
    
      count
      22394.000000
      27974.000000
      27974.000000
      27974.000000
      27974.000000
      17094.000000
      26958.000000
      26084.000000
      26084.000000
      25961.000000
      25830.000000
      24461.000000
      25166.000000
      25303.000000
      22303.000000
      22473.000000
      20116.000000
      22184.000000
      21849.000000
      19316.000000
      26230.000000
      25815.000000
      25535.000000
      24948.000000
      18614.000000
      24423.000000
      22847.000000
      21627.000000
      21315.000000
    
    
      mean
      29062.007681
      1.857582
      2.213877
      2.118932
      4.037070
      5348.674798
      11011.289970
      2222.314944
      0.014165
      0.223849
      0.413015
      0.493549
      52556.296987
      22942.376603
      0.278013
      0.205554
      0.238424
      0.374718
      0.205404
      0.040188
      2614.042242
      2266.867209
      1965.423497
      1385.139370
      1334.820297
      249.651435
      8994.602180
      12483.039742
      6405.177668
    
    
      std
      10822.399280
      1.008607
      1.274660
      0.841436
      12.977136
      1935.658093
      172001.637782
      4855.392337
      0.043849
      0.240339
      0.220127
      0.135478
      24402.757134
      9926.116828
      0.254066
      0.135533
      0.195519
      0.243561
      0.144668
      0.050813
      14366.819640
      13225.712340
      10592.497586
      5225.296605
      5179.278829
      891.605076
      4821.270423
      7030.661573
      3447.324241
    
    
      min
      7000.000000
      0.000000
      0.000000
      1.000000
      1.000000
      42.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000400
      0.000000
      443.174960
      215.042694
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      10.000000
      10.000000
      10.000000
      10.000000
      10.000000
      0.000000
      124.000000
      654.000000
      520.000000
    
    
      25%
      22000.000000
      1.000000
      1.000000
      1.000000
      1.000000
      4088.250000
      2537.750000
      127.000000
      0.000000
      0.000000
      0.253025
      0.413534
      33732.107442
      16257.267275
      0.062918
      0.106195
      0.060261
      0.155203
      0.101010
      0.000000
      119.000000
      116.000000
      114.000000
      107.000000
      111.000000
      23.000000
      5400.000000
      6625.000000
      3750.000000
    
    
      50%
      27500.000000
      2.000000
      2.000000
      2.000000
      1.000000
      5160.000000
      5909.000000
      522.000000
      0.000000
      0.145700
      0.411000
      0.510294
      49126.867425
      21397.270020
      0.160684
      0.198300
      0.204279
      0.378328
      0.195652
      0.025147
      450.000000
      420.000000
      407.000000
      373.000000
      369.000000
      66.000000
      7905.000000
      11119.000000
      5500.000000
    
    
      75%
      34400.000000
      3.000000
      4.000000
      3.000000
      2.000000
      6376.000000
      10877.750000
      2067.000000
      0.011200
      0.386400
      0.563400
      0.581560
      68785.735878
      27541.043335
      0.514689
      0.293532
      0.391884
      0.571936
      0.295954
      0.066380
      1350.000000
      1230.000000
      1139.500000
      994.000000
      968.000000
      186.000000
      12000.000000
      17125.000000
      8250.000000
    
    
      max
      133600.000000
      4.000000
      4.000000
      3.000000
      128.000000
      24699.000000
      26670163.000000
      249604.000000
      1.000000
      1.000000
      1.000000
      1.000000
      181008.007100
      79375.209910
      1.000000
      0.781513
      1.000000
      1.000000
      0.842161
      0.694737
      237888.000000
      222715.000000
      170316.000000
      72057.000000
      70824.000000
      12567.000000
      95984.000000
      47186.500000
      33125.000000



In [11]:

    
data_reduced.dropna(inplace=True)



In [12]:

    
plt.figure(figsize=(12, 12))

sns.regplot(x=data_reduced.PAR_ED_PCT_1STGEN, y=data_reduced.md_earn_wne_p6, color='#348ABD');

plt.title('Earnings and Percentage of First-Generation Students')
plt.xlabel('Percentage of First-Generation Students')
plt.ylabel('Median Earnings Six Years After Entry')

plt.ylim(0, 120000);

plt.gca().get_yaxis().set_major_formatter(
    mpl.ticker.FuncFormatter(lambda y, p: format(int(y), ','))
)



In [13]:

    
plt.figure(figsize=(10, 12))

sns.boxplot(x="CONTROL", y="md_earn_wne_p6", data=data_reduced,
            order=np.sort(data_reduced.CONTROL.unique()), color='White',
            fliersize=0, width=0.25)
sns.stripplot(x="CONTROL", y="md_earn_wne_p6", data=data_reduced,
              order=np.sort(data_reduced.CONTROL.unique()),
              alpha=0.25, size=5,
              color='#348ABD', edgecolor='#348ABD')

plt.title('Distribution of Earnings by Institution Type')
plt.xlabel('Institution Type')
plt.ylabel('')

plt.xticks(range(3), ['Public', 'Private Nonprofit', 'Private For-profit'])

plt.ylim(0, 120000);

plt.gca().get_yaxis().set_major_formatter(
    mpl.ticker.FuncFormatter(lambda y, p: format(int(y), ','))
)



In [14]:

    
plt.figure(figsize=(12, 12))

sns.regplot(x=data_reduced.AVGFACSAL, y=data_reduced.md_earn_wne_p6, color='#348ABD');

plt.title('Earnings and Average Faculty Salary')
plt.xlabel('Average Faculty Salary')
plt.ylabel('Median Earnings Six Years After Entry')

plt.xlim(0, 20000);
plt.ylim(0, 120000);

plt.gca().get_xaxis().set_major_formatter(
    mpl.ticker.FuncFormatter(lambda x, p: format(int(x), ','))
)
plt.gca().get_yaxis().set_major_formatter(
    mpl.ticker.FuncFormatter(lambda y, p: format(int(y), ','))
)



In [15]:

    
plt.figure(figsize=(10, 12))

sns.boxplot(x="PREDDEG", y="md_earn_wne_p6", data=data_reduced,
            order=np.sort(data_reduced.PREDDEG.unique()), color='White',
            fliersize=0, width=0.25)
sns.stripplot(x="PREDDEG", y="md_earn_wne_p6", data=data_reduced,
              order=np.sort(data_reduced.PREDDEG.unique()),
              alpha=0.25, size=5,
              color='#348ABD', edgecolor='#348ABD')

plt.title('Distribution of Earnings by Predominant Degree Type')
plt.xlabel('Degree Type')
plt.ylabel('')

plt.xticks(range(4), ['Certificate', 'Associates', 'Bachelor\'s', 'Graduate'])

plt.ylim(0, 120000);

plt.gca().get_yaxis().set_major_formatter(
    mpl.ticker.FuncFormatter(lambda y, p: format(int(y), ','))
)



In [ ]:

	md_earn_wne_p6	PREDDEG	HIGHDEG	CONTROL	NUMBRANCH	AVGFACSAL	TUITFTE	UGDS	UGDS_NRA	PPTUG_EF	UG25abv	PAR_ED_PCT_1STGEN	DEP_INC_AVG	IND_INC_AVG	COMP_ORIG_YR2_RT	WDRAW_ORIG_YR2_RT	ENRL_ORIG_YR2_RT	COMP_ORIG_YR4_RT	WDRAW_ORIG_YR4_RT	ENRL_ORIG_YR4_RT	OVERALL_YR2_N	OVERALL_YR3_N	OVERALL_YR4_N	OVERALL_YR6_N	OVERALL_YR8_N	count_nwne_p6	DEBT_MDN	GRAD_DEBT_MDN	WDRAW_DEBT_MDN
count	22394.000000	27974.000000	27974.000000	27974.000000	27974.000000	17094.000000	26958.000000	26084.000000	26084.000000	25961.000000	25830.000000	24461.000000	25166.000000	25303.000000	22303.000000	22473.000000	20116.000000	22184.000000	21849.000000	19316.000000	26230.000000	25815.000000	25535.000000	24948.000000	18614.000000	24423.000000	22847.000000	21627.000000	21315.000000
mean	29062.007681	1.857582	2.213877	2.118932	4.037070	5348.674798	11011.289970	2222.314944	0.014165	0.223849	0.413015	0.493549	52556.296987	22942.376603	0.278013	0.205554	0.238424	0.374718	0.205404	0.040188	2614.042242	2266.867209	1965.423497	1385.139370	1334.820297	249.651435	8994.602180	12483.039742	6405.177668
std	10822.399280	1.008607	1.274660	0.841436	12.977136	1935.658093	172001.637782	4855.392337	0.043849	0.240339	0.220127	0.135478	24402.757134	9926.116828	0.254066	0.135533	0.195519	0.243561	0.144668	0.050813	14366.819640	13225.712340	10592.497586	5225.296605	5179.278829	891.605076	4821.270423	7030.661573	3447.324241
min	7000.000000	0.000000	0.000000	1.000000	1.000000	42.000000	0.000000	0.000000	0.000000	0.000000	0.000400	0.000000	443.174960	215.042694	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	10.000000	10.000000	10.000000	10.000000	10.000000	0.000000	124.000000	654.000000	520.000000
25%	22000.000000	1.000000	1.000000	1.000000	1.000000	4088.250000	2537.750000	127.000000	0.000000	0.000000	0.253025	0.413534	33732.107442	16257.267275	0.062918	0.106195	0.060261	0.155203	0.101010	0.000000	119.000000	116.000000	114.000000	107.000000	111.000000	23.000000	5400.000000	6625.000000	3750.000000
50%	27500.000000	2.000000	2.000000	2.000000	1.000000	5160.000000	5909.000000	522.000000	0.000000	0.145700	0.411000	0.510294	49126.867425	21397.270020	0.160684	0.198300	0.204279	0.378328	0.195652	0.025147	450.000000	420.000000	407.000000	373.000000	369.000000	66.000000	7905.000000	11119.000000	5500.000000
75%	34400.000000	3.000000	4.000000	3.000000	2.000000	6376.000000	10877.750000	2067.000000	0.011200	0.386400	0.563400	0.581560	68785.735878	27541.043335	0.514689	0.293532	0.391884	0.571936	0.295954	0.066380	1350.000000	1230.000000	1139.500000	994.000000	968.000000	186.000000	12000.000000	17125.000000	8250.000000
max	133600.000000	4.000000	4.000000	3.000000	128.000000	24699.000000	26670163.000000	249604.000000	1.000000	1.000000	1.000000	1.000000	181008.007100	79375.209910	1.000000	0.781513	1.000000	1.000000	0.842161	0.694737	237888.000000	222715.000000	170316.000000	72057.000000	70824.000000	12567.000000	95984.000000	47186.500000	33125.000000