In [1]:
import warnings
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import normalize
# Allow up to 1800 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 1800)
# Apply matplotlib's 'ggplot' style sheet to all figures.
plt.style.use('ggplot')
# NOTE(review): this silences every warning for the rest of the session,
# including pandas' SettingWithCopyWarning -- consider narrowing the filter.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
In [2]:
# Resolution used when figures are written to disk.
mpl.rc('savefig', dpi=200)

# Figure defaults applied to every plot created after this cell.
params = {
    'figure.dpi': 200,
    'figure.figsize': (12, 10),
    'axes.axisbelow': True,
    'lines.antialiased': True,
    'axes.titlesize': 'xx-large',
    'axes.labelsize': 'x-large',
    'xtick.labelsize': 'large',
    'ytick.labelsize': 'large',
}
# plt.rcParams and mpl.rcParams are the same object; update it in one call.
mpl.rcParams.update(params)
In [3]:
# Yearly extracts that together form the training set.
files = ['data/MERGED2003_PP.csv', 'data/MERGED2005_PP.csv',
         'data/MERGED2007_PP.csv']

# Read each file, print its (rows, columns) shape as loaded, and tag its rows
# as training data so they can be told apart after the test year is appended.
dfs = []
for file in files:
    df = pd.read_csv(file, low_memory=False)
    print(df.shape)
    df['type'] = 'training'
    dfs.append(df)

# One concatenation for all years. Each file keeps its own row index, so index
# labels repeat across years in the combined frame.
data = pd.concat(dfs)
In [4]:
# Held-out year: load it, print its (rows, columns) shape as loaded, then tag
# every row as testing data.
testing = pd.read_csv('data/MERGED2011_PP.csv', low_memory=False)
print(testing.shape)
testing = testing.assign(type='testing')
In [5]:
# Append the held-out rows to the training rows. Rows already tagged 'testing'
# are dropped first, so re-running this cell cannot append the test set twice;
# on a fresh run the filter keeps every row and the result is unchanged.
data = pd.concat([data[data['type'] != 'testing'], testing])
In [6]:
# Column groups used to select the modelling frame. Group names follow the
# variable names; the concatenation of all groups is taken from `data` below.

# Target variable (labelled "Median Earnings Six Years After Entry" in the plots).
cols_target = ['md_earn_wne_p6']

# School-level attributes.
cols_school = [
    'PREDDEG',
    'HIGHDEG',
    'CONTROL',
    'NUMBRANCH',
    'AVGFACSAL',
]
# Currently disabled -- columns that previously extended cols_school:
# 'PCIP01', 'PCIP03', 'PCIP04', 'PCIP05', 'PCIP09', 'PCIP10',
# 'PCIP11', 'PCIP12', 'PCIP13', 'PCIP14', 'PCIP15', 'PCIP16',
# 'PCIP19', 'PCIP22', 'PCIP23', 'PCIP24', 'PCIP25', 'PCIP26',
# 'PCIP27', 'PCIP29', 'PCIP30', 'PCIP31', 'PCIP38', 'PCIP39',
# 'PCIP40', 'PCIP41', 'PCIP42', 'PCIP43', 'PCIP44', 'PCIP45',
# 'PCIP46', 'PCIP47', 'PCIP48', 'PCIP49', 'PCIP50', 'PCIP51',
# 'PCIP52', 'PCIP54']
# Currently disabled -- admissions group:
# cols_admissions = ['ADM_RATE', 'SATVR25', 'SATVR75', 'SATMT25', 'SATMT75', 'SAT_AVG']

# Cost columns.
cols_costs = ['TUITFTE']

# Student-body columns.
cols_studentbody = [
    'UGDS', 'UGDS_NRA', 'PPTUG_EF', 'UG25abv',
    'PAR_ED_PCT_1STGEN', 'DEP_INC_AVG', 'IND_INC_AVG',
    'COMP_ORIG_YR2_RT', 'WDRAW_ORIG_YR2_RT', 'ENRL_ORIG_YR2_RT',
    'COMP_ORIG_YR4_RT', 'WDRAW_ORIG_YR4_RT', 'ENRL_ORIG_YR4_RT',
    'OVERALL_YR2_N', 'OVERALL_YR3_N', 'OVERALL_YR4_N',
    'OVERALL_YR6_N', 'OVERALL_YR8_N', 'count_nwne_p6',
]

# Financial-aid columns.
cols_financialaid = [
    'DEBT_MDN',
    'GRAD_DEBT_MDN',
    'WDRAW_DEBT_MDN',
]

# Bookkeeping column holding the 'training' / 'testing' tag.
cols_other = ['type']
In [7]:
# Show the (rows, columns) shape of the combined training + testing frame.
data.shape
Out[7]:
In [8]:
# All selected columns, in the order target, school, costs, student body,
# financial aid, then the training/testing tag.
cols_selected = (cols_target + cols_school + cols_costs
                 + cols_studentbody + cols_financialaid + cols_other)

# Take an explicit copy: later cells assign to and drop rows from this frame,
# which must not be treated as a write to a slice of `data`.
data_reduced = data[cols_selected].copy()
In [9]:
# Columns that were read as strings (object dtype) are converted to float,
# with the literal marker 'PrivacySuppressed' turned into NaN. The 'type'
# column is the training/testing tag and is left untouched.
converted = {}
for c in data_reduced.columns:
    if (data_reduced[c].dtype == object) and (c != 'type'):
        raw = data_reduced[c]
        # mask() blanks the marker to NaN; to_numeric then parses the rest in
        # one vectorised pass and still raises on any other non-numeric text.
        converted[c] = pd.to_numeric(raw.mask(raw == 'PrivacySuppressed')).astype(float)

# Rebind to a new frame (column order is preserved) instead of writing the
# columns one by one into a frame that may be a slice of `data`.
data_reduced = data_reduced.assign(**converted)
In [10]:
# Display summary statistics for the reduced frame.
data_reduced.describe()
Out[10]:
In [11]:
# Keep only rows in which every selected column has a value. The result is
# rebound rather than produced with inplace=True, so the frame is never
# mutated in place; re-running the cell gives the same result.
data_reduced = data_reduced.dropna()
In [12]:
# Scatter plot with fitted regression line: share of first-generation students
# (x) against median earnings (y).
fig, ax = plt.subplots(figsize=(12, 12))
sns.regplot(x='PAR_ED_PCT_1STGEN', y='md_earn_wne_p6', data=data_reduced,
            color='#348ABD', ax=ax)
ax.set_title('Earnings and Percentage of First-Generation Students')
ax.set_xlabel('Percentage of First-Generation Students')
ax.set_ylabel('Median Earnings Six Years After Entry')
ax.set_ylim(0, 120000)
# Print y tick values as integers with thousands separators.
ax.yaxis.set_major_formatter(
    mpl.ticker.FuncFormatter(lambda value, _pos: format(int(value), ','))
)
In [13]:
# Display names for the CONTROL codes.
# NOTE(review): the 1/2/3 coding is taken from the College Scorecard data
# dictionary, not from this notebook -- confirm against the dictionary in use.
control_labels = {1: 'Public', 2: 'Private Nonprofit', 3: 'Private For-profit'}

# Codes actually present after cleaning, in ascending order; shared by both
# layers so boxes and points line up.
control_order = np.sort(data_reduced.CONTROL.unique())

# Box plot (outlier markers hidden) overlaid with the individual schools.
fig, ax = plt.subplots(figsize=(10, 12))
sns.boxplot(x="CONTROL", y="md_earn_wne_p6", data=data_reduced,
            order=control_order, color='White',
            fliersize=0, width=0.25, ax=ax)
sns.stripplot(x="CONTROL", y="md_earn_wne_p6", data=data_reduced,
              order=control_order,
              alpha=0.25, size=5,
              color='#348ABD', edgecolor='#348ABD', ax=ax)
ax.set_title('Distribution of Earnings by Institution Type')
ax.set_xlabel('Institution Type')
ax.set_ylabel('')
# Label each tick from the code plotted at that position, so a missing or
# unexpected code cannot shift the names onto the wrong boxes. Unknown codes
# fall back to the raw code value.
ax.set_xticks(range(len(control_order)))
ax.set_xticklabels([control_labels.get(code, str(code)) for code in control_order])
ax.set_ylim(0, 120000)
# Print y tick values as integers with thousands separators.
ax.yaxis.set_major_formatter(
    mpl.ticker.FuncFormatter(lambda value, _pos: format(int(value), ','))
)
In [14]:
# Scatter plot with fitted regression line: average faculty salary (x) against
# median earnings (y).
fig, ax = plt.subplots(figsize=(12, 12))
sns.regplot(x='AVGFACSAL', y='md_earn_wne_p6', data=data_reduced,
            color='#348ABD', ax=ax)
ax.set_title('Earnings and Average Faculty Salary')
ax.set_xlabel('Average Faculty Salary')
ax.set_ylabel('Median Earnings Six Years After Entry')
ax.set_xlim(0, 20000)
ax.set_ylim(0, 120000)
# Print tick values on both axes as integers with thousands separators. Each
# axis gets its own formatter instance.
ax.xaxis.set_major_formatter(
    mpl.ticker.FuncFormatter(lambda value, _pos: format(int(value), ','))
)
ax.yaxis.set_major_formatter(
    mpl.ticker.FuncFormatter(lambda value, _pos: format(int(value), ','))
)
In [15]:
# Display names for the PREDDEG codes.
# NOTE(review): the 0-4 coding (0 = not classified) is taken from the College
# Scorecard data dictionary, not from this notebook -- confirm before relying
# on it.
preddeg_labels = {0: 'Not classified', 1: 'Certificate', 2: 'Associates',
                  3: "Bachelor's", 4: 'Graduate'}

# Codes actually present after cleaning, in ascending order; shared by both
# layers so boxes and points line up.
preddeg_order = np.sort(data_reduced.PREDDEG.unique())

# Box plot (outlier markers hidden) overlaid with the individual schools.
fig, ax = plt.subplots(figsize=(10, 12))
sns.boxplot(x="PREDDEG", y="md_earn_wne_p6", data=data_reduced,
            order=preddeg_order, color='White',
            fliersize=0, width=0.25, ax=ax)
sns.stripplot(x="PREDDEG", y="md_earn_wne_p6", data=data_reduced,
              order=preddeg_order,
              alpha=0.25, size=5,
              color='#348ABD', edgecolor='#348ABD', ax=ax)
ax.set_title('Distribution of Earnings by Predominant Degree Type')
ax.set_xlabel('Degree Type')
ax.set_ylabel('')
# Label each tick from the code plotted at that position, so a missing or
# additional code cannot shift the names onto the wrong boxes. Unknown codes
# fall back to the raw code value.
ax.set_xticks(range(len(preddeg_order)))
ax.set_xticklabels([preddeg_labels.get(code, str(code)) for code in preddeg_order])
ax.set_ylim(0, 120000)
# Print y tick values as integers with thousands separators.
ax.yaxis.set_major_formatter(
    mpl.ticker.FuncFormatter(lambda value, _pos: format(int(value), ','))
)
In [ ]: