In [1]:
# This function drops all the columns with too many 'PrivacySuppressed' and 'Null' or 'NaN'.
# The threshold used here

import pandas as pd
import numpy as np
# import os
# print(os.getcwd())

def getColumns(dataset, threshold=0.9):
    dataset.replace('PrivacvySuppressed', np.NaN)
    numOfRow = dataset.shape[0]
    nanList = dataset.isnull().sum().tolist()
    columnList = list(dataset.columns.values)
    colList = []
    for i in range(len(columnList)):
        column = columnList[i]
        if nanList[i] <= numOfRow * threshold:
            colList.append(column)
    return colList

def getRows(dataset, threshold=0.1):
    numOfRow = dataset.shape[0]
    numOfCol = dataset.shape[1]
    nanList = dataset.isnull().sum(axis=1).tolist()
    rowList = []
    schoolList = []
    idList = []
    for index, row in dataset.iterrows():
        if nanList[index] < numOfCol * threshold:
            rowList.append(index)
            schoolList.append(row['INSTNM'])
            idList.append(row['OPEID'])
    return set(idList)

raw11 = pd.read_csv('../../data/rawData/MERGED2010_11_PP.csv', sep=',', error_bad_lines=False, index_col=False, dtype='unicode')
print('read in raw data: 2011')
raw12 = pd.read_csv('../../data/rawData/MERGED2011_12_PP.csv', sep=',', error_bad_lines=False, index_col=False, dtype='unicode')
print('read in raw data: 2012')
raw13 = pd.read_csv('../../data/rawData/MERGED2012_13_PP.csv', sep=',', error_bad_lines=False, index_col=False, dtype='unicode')
print('read in raw data: 2013')
raw14 = pd.read_csv('../../data/rawData/MERGED2013_14_PP.csv', sep=',', error_bad_lines=False, index_col=False, dtype='unicode')
print('read in raw data: 2014')
raw15 = pd.read_csv('../../data/rawData/MERGED2014_15_PP.csv', sep=',', error_bad_lines=False, index_col=False, dtype='unicode')
print('read in raw data: 2015')
rawDataList = [raw11, raw12, raw13, raw14, raw15]
nameList = ['year11.csv', 'year12.csv','year13.csv','year14.csv','year15.csv']

# Use the newest data set of 20 as base for choosing columns
thresholdColumn = 0.9
columnList = getColumns(raw15, thresholdColumn)
print('This is the number of columns that we use with a NaN threshold of: ' + str(thresholdColumn))
print(len(columnList))
reducedColumnData = []

for i in range(len(rawDataList)):
	temp = rawDataList[i]
	reduced = temp[columnList]
	reducedColumnData.append(reduced)


read in raw data: 2011
read in raw data: 2012
read in raw data: 2013
read in raw data: 2014
read in raw data: 2015
This is the number of columns that we use with a NaN threshold of: 0.9
569

In [3]:
# Use the intersection of rows to choose rows
rowListSet = []
thresholdRow = 0.9
rowSet11 = getRows(reducedColumnData[0], thresholdRow)
rowSet12 = getRows(reducedColumnData[1], thresholdRow)
rowSet13 = getRows(reducedColumnData[2], thresholdRow)
rowSet14 = getRows(reducedColumnData[3], thresholdRow)
rowSet15 = getRows(reducedColumnData[4], thresholdRow)
rowList = set.intersection(rowSet11, rowSet12, rowSet13, rowSet14, rowSet15)
print('This is number of school OPEID that we use with a NaN threshold of: ' + str(thresholdRow))
print(len(rowList))


This is number of school OPEID that we use with a NaN threshold of: 0.9
6456

In [21]:
temp = reducedColumnData[0]
temp.columns.values


Out[21]:
array(['\ufeffUNITID', 'OPEID', 'OPEID6', 'INSTNM', 'CITY', 'STABBR',
       'ZIP', 'ACCREDAGENCY', 'INSTURL', 'NPCURL', 'HCM2', 'MAIN',
       'NUMBRANCH', 'PREDDEG', 'HIGHDEG', 'CONTROL', 'ST_FIPS', 'REGION',
       'LOCALE', 'LATITUDE', 'LONGITUDE', 'CCBASIC', 'CCUGPROF',
       'CCSIZSET', 'HBCU', 'PBI', 'ANNHI', 'TRIBAL', 'AANAPII', 'HSI',
       'NANTI', 'MENONLY', 'WOMENONLY', 'RELAFFIL', 'ADM_RATE',
       'ADM_RATE_ALL', 'SATVR25', 'SATVR75', 'SATMT25', 'SATMT75',
       'SATVRMID', 'SATMTMID', 'ACTCM25', 'ACTCM75', 'ACTEN25', 'ACTEN75',
       'ACTMT25', 'ACTMT75', 'ACTCMMID', 'ACTENMID', 'ACTMTMID', 'SAT_AVG',
       'SAT_AVG_ALL', 'PCIP01', 'PCIP03', 'PCIP04', 'PCIP05', 'PCIP09',
       'PCIP10', 'PCIP11', 'PCIP12', 'PCIP13', 'PCIP14', 'PCIP15',
       'PCIP16', 'PCIP19', 'PCIP22', 'PCIP23', 'PCIP24', 'PCIP25',
       'PCIP26', 'PCIP27', 'PCIP29', 'PCIP30', 'PCIP31', 'PCIP38',
       'PCIP39', 'PCIP40', 'PCIP41', 'PCIP42', 'PCIP43', 'PCIP44',
       'PCIP45', 'PCIP46', 'PCIP47', 'PCIP48', 'PCIP49', 'PCIP50',
       'PCIP51', 'PCIP52', 'PCIP54', 'CIP01CERT1', 'CIP01CERT2',
       'CIP01ASSOC', 'CIP01CERT4', 'CIP01BACHL', 'CIP03CERT1',
       'CIP03CERT2', 'CIP03ASSOC', 'CIP03CERT4', 'CIP03BACHL',
       'CIP04CERT1', 'CIP04CERT2', 'CIP04ASSOC', 'CIP04CERT4',
       'CIP04BACHL', 'CIP05CERT1', 'CIP05CERT2', 'CIP05ASSOC',
       'CIP05CERT4', 'CIP05BACHL', 'CIP09CERT1', 'CIP09CERT2',
       'CIP09ASSOC', 'CIP09CERT4', 'CIP09BACHL', 'CIP10CERT1',
       'CIP10CERT2', 'CIP10ASSOC', 'CIP10CERT4', 'CIP10BACHL',
       'CIP11CERT1', 'CIP11CERT2', 'CIP11ASSOC', 'CIP11CERT4',
       'CIP11BACHL', 'CIP12CERT1', 'CIP12CERT2', 'CIP12ASSOC',
       'CIP12CERT4', 'CIP12BACHL', 'CIP13CERT1', 'CIP13CERT2',
       'CIP13ASSOC', 'CIP13CERT4', 'CIP13BACHL', 'CIP14CERT1',
       'CIP14CERT2', 'CIP14ASSOC', 'CIP14CERT4', 'CIP14BACHL',
       'CIP15CERT1', 'CIP15CERT2', 'CIP15ASSOC', 'CIP15CERT4',
       'CIP15BACHL', 'CIP16CERT1', 'CIP16CERT2', 'CIP16ASSOC',
       'CIP16CERT4', 'CIP16BACHL', 'CIP19CERT1', 'CIP19CERT2',
       'CIP19ASSOC', 'CIP19CERT4', 'CIP19BACHL', 'CIP22CERT1',
       'CIP22CERT2', 'CIP22ASSOC', 'CIP22CERT4', 'CIP22BACHL',
       'CIP23CERT1', 'CIP23CERT2', 'CIP23ASSOC', 'CIP23CERT4',
       'CIP23BACHL', 'CIP24CERT1', 'CIP24CERT2', 'CIP24ASSOC',
       'CIP24CERT4', 'CIP24BACHL', 'CIP25CERT1', 'CIP25CERT2',
       'CIP25ASSOC', 'CIP25CERT4', 'CIP25BACHL', 'CIP26CERT1',
       'CIP26CERT2', 'CIP26ASSOC', 'CIP26CERT4', 'CIP26BACHL',
       'CIP27CERT1', 'CIP27CERT2', 'CIP27ASSOC', 'CIP27CERT4',
       'CIP27BACHL', 'CIP29CERT1', 'CIP29CERT2', 'CIP29ASSOC',
       'CIP29CERT4', 'CIP29BACHL', 'CIP30CERT1', 'CIP30CERT2',
       'CIP30ASSOC', 'CIP30CERT4', 'CIP30BACHL', 'CIP31CERT1',
       'CIP31CERT2', 'CIP31ASSOC', 'CIP31CERT4', 'CIP31BACHL',
       'CIP38CERT1', 'CIP38CERT2', 'CIP38ASSOC', 'CIP38CERT4',
       'CIP38BACHL', 'CIP39CERT1', 'CIP39CERT2', 'CIP39ASSOC',
       'CIP39CERT4', 'CIP39BACHL', 'CIP40CERT1', 'CIP40CERT2',
       'CIP40ASSOC', 'CIP40CERT4', 'CIP40BACHL', 'CIP41CERT1',
       'CIP41CERT2', 'CIP41ASSOC', 'CIP41CERT4', 'CIP41BACHL',
       'CIP42CERT1', 'CIP42CERT2', 'CIP42ASSOC', 'CIP42CERT4',
       'CIP42BACHL', 'CIP43CERT1', 'CIP43CERT2', 'CIP43ASSOC',
       'CIP43CERT4', 'CIP43BACHL', 'CIP44CERT1', 'CIP44CERT2',
       'CIP44ASSOC', 'CIP44CERT4', 'CIP44BACHL', 'CIP45CERT1',
       'CIP45CERT2', 'CIP45ASSOC', 'CIP45CERT4', 'CIP45BACHL',
       'CIP46CERT1', 'CIP46CERT2', 'CIP46ASSOC', 'CIP46CERT4',
       'CIP46BACHL', 'CIP47CERT1', 'CIP47CERT2', 'CIP47ASSOC',
       'CIP47CERT4', 'CIP47BACHL', 'CIP48CERT1', 'CIP48CERT2',
       'CIP48ASSOC', 'CIP48CERT4', 'CIP48BACHL', 'CIP49CERT1',
       'CIP49CERT2', 'CIP49ASSOC', 'CIP49CERT4', 'CIP49BACHL',
       'CIP50CERT1', 'CIP50CERT2', 'CIP50ASSOC', 'CIP50CERT4',
       'CIP50BACHL', 'CIP51CERT1', 'CIP51CERT2', 'CIP51ASSOC',
       'CIP51CERT4', 'CIP51BACHL', 'CIP52CERT1', 'CIP52CERT2',
       'CIP52ASSOC', 'CIP52CERT4', 'CIP52BACHL', 'CIP54CERT1',
       'CIP54CERT2', 'CIP54ASSOC', 'CIP54CERT4', 'CIP54BACHL',
       'DISTANCEONLY', 'UGDS', 'UGDS_WHITE', 'UGDS_BLACK', 'UGDS_HISP',
       'UGDS_ASIAN', 'UGDS_AIAN', 'UGDS_NHPI', 'UGDS_2MOR', 'UGDS_NRA',
       'UGDS_UNKN', 'PPTUG_EF', 'CURROPER', 'NPT4_PUB', 'NPT4_PRIV',
       'NPT41_PUB', 'NPT42_PUB', 'NPT43_PUB', 'NPT44_PUB', 'NPT45_PUB',
       'NPT41_PRIV', 'NPT42_PRIV', 'NPT43_PRIV', 'NPT44_PRIV',
       'NPT45_PRIV', 'NPT4_048_PUB', 'NPT4_048_PRIV', 'NPT4_3075_PUB',
       'NPT4_3075_PRIV', 'NPT4_75UP_PUB', 'NPT4_75UP_PRIV', 'NUM4_PUB',
       'NUM4_PRIV', 'NUM41_PUB', 'NUM42_PUB', 'NUM43_PUB', 'NUM44_PUB',
       'NUM45_PUB', 'NUM41_PRIV', 'NUM42_PRIV', 'NUM43_PRIV', 'NUM44_PRIV',
       'NUM45_PRIV', 'COSTT4_A', 'COSTT4_P', 'TUITIONFEE_IN',
       'TUITIONFEE_OUT', 'TUITIONFEE_PROG', 'TUITFTE', 'INEXPFTE',
       'AVGFACSAL', 'PFTFAC', 'PCTPELL', 'C150_4', 'C150_L4',
       'C150_4_POOLED', 'C150_L4_POOLED', 'POOLYRS', 'PFTFTUG1_EF',
       'D150_4', 'D150_L4', 'D150_4_POOLED', 'D150_L4_POOLED',
       'C150_4_WHITE', 'C150_4_BLACK', 'C150_4_HISP', 'C150_4_ASIAN',
       'C150_4_AIAN', 'C150_4_2MOR', 'C150_4_NRA', 'C150_4_UNKN',
       'C150_L4_WHITE', 'C150_L4_BLACK', 'C150_L4_HISP', 'C150_L4_ASIAN',
       'C150_L4_AIAN', 'C150_L4_2MOR', 'C150_L4_UNKN', 'C200_4', 'C200_L4',
       'D200_4', 'D200_L4', 'RET_FT4', 'RET_FTL4', 'RET_PT4', 'RET_PTL4',
       'C200_4_POOLED', 'C200_L4_POOLED', 'POOLYRS200', 'D200_4_POOLED',
       'D200_L4_POOLED', 'PCTFLOAN', 'CDR3', 'RPY_3YR_RT',
       'COMPL_RPY_3YR_RT', 'NONCOM_RPY_3YR_RT', 'LO_INC_RPY_3YR_RT',
       'MD_INC_RPY_3YR_RT', 'HI_INC_RPY_3YR_RT', 'DEP_RPY_3YR_RT',
       'IND_RPY_3YR_RT', 'PELL_RPY_3YR_RT', 'NOPELL_RPY_3YR_RT',
       'FEMALE_RPY_3YR_RT', 'MALE_RPY_3YR_RT', 'FIRSTGEN_RPY_3YR_RT',
       'NOTFIRSTGEN_RPY_3YR_RT', 'RPY_5YR_RT', 'COMPL_RPY_5YR_RT',
       'NONCOM_RPY_5YR_RT', 'LO_INC_RPY_5YR_RT', 'MD_INC_RPY_5YR_RT',
       'HI_INC_RPY_5YR_RT', 'DEP_RPY_5YR_RT', 'IND_RPY_5YR_RT',
       'PELL_RPY_5YR_RT', 'NOPELL_RPY_5YR_RT', 'FEMALE_RPY_5YR_RT',
       'MALE_RPY_5YR_RT', 'FIRSTGEN_RPY_5YR_RT', 'NOTFIRSTGEN_RPY_5YR_RT',
       'RPY_7YR_RT', 'COMPL_RPY_7YR_RT', 'NONCOM_RPY_7YR_RT',
       'LO_INC_RPY_7YR_RT', 'MD_INC_RPY_7YR_RT', 'HI_INC_RPY_7YR_RT',
       'DEP_RPY_7YR_RT', 'IND_RPY_7YR_RT', 'PELL_RPY_7YR_RT',
       'NOPELL_RPY_7YR_RT', 'FEMALE_RPY_7YR_RT', 'MALE_RPY_7YR_RT',
       'FIRSTGEN_RPY_7YR_RT', 'NOTFIRSTGEN_RPY_7YR_RT', 'INC_PCT_LO',
       'DEP_STAT_PCT_IND', 'DEP_INC_PCT_LO', 'IND_INC_PCT_LO',
       'PAR_ED_PCT_1STGEN', 'INC_PCT_M1', 'INC_PCT_M2', 'INC_PCT_H1',
       'INC_PCT_H2', 'DEP_INC_PCT_M1', 'DEP_INC_PCT_M2', 'DEP_INC_PCT_H1',
       'DEP_INC_PCT_H2', 'IND_INC_PCT_M1', 'IND_INC_PCT_M2',
       'IND_INC_PCT_H1', 'IND_INC_PCT_H2', 'PAR_ED_PCT_MS',
       'PAR_ED_PCT_HS', 'PAR_ED_PCT_PS', 'APPL_SCH_PCT_GE2',
       'APPL_SCH_PCT_GE3', 'APPL_SCH_PCT_GE4', 'APPL_SCH_PCT_GE5',
       'DEP_INC_AVG', 'IND_INC_AVG', 'DEBT_MDN', 'GRAD_DEBT_MDN',
       'WDRAW_DEBT_MDN', 'LO_INC_DEBT_MDN', 'MD_INC_DEBT_MDN',
       'HI_INC_DEBT_MDN', 'DEP_DEBT_MDN', 'IND_DEBT_MDN', 'PELL_DEBT_MDN',
       'NOPELL_DEBT_MDN', 'FEMALE_DEBT_MDN', 'MALE_DEBT_MDN',
       'FIRSTGEN_DEBT_MDN', 'NOTFIRSTGEN_DEBT_MDN', 'DEBT_N',
       'GRAD_DEBT_N', 'WDRAW_DEBT_N', 'LO_INC_DEBT_N', 'MD_INC_DEBT_N',
       'HI_INC_DEBT_N', 'DEP_DEBT_N', 'IND_DEBT_N', 'PELL_DEBT_N',
       'NOPELL_DEBT_N', 'FEMALE_DEBT_N', 'MALE_DEBT_N', 'FIRSTGEN_DEBT_N',
       'NOTFIRSTGEN_DEBT_N', 'GRAD_DEBT_MDN10YR', 'CUML_DEBT_N',
       'CUML_DEBT_P90', 'CUML_DEBT_P75', 'CUML_DEBT_P25', 'CUML_DEBT_P10',
       'INC_N', 'DEP_INC_N', 'IND_INC_N', 'DEP_STAT_N', 'PAR_ED_N',
       'APPL_SCH_N', 'RPY_3YR_N', 'COMPL_RPY_3YR_N', 'NONCOM_RPY_3YR_N',
       'LO_INC_RPY_3YR_N', 'MD_INC_RPY_3YR_N', 'HI_INC_RPY_3YR_N',
       'DEP_RPY_3YR_N', 'IND_RPY_3YR_N', 'PELL_RPY_3YR_N',
       'NOPELL_RPY_3YR_N', 'FEMALE_RPY_3YR_N', 'MALE_RPY_3YR_N',
       'FIRSTGEN_RPY_3YR_N', 'NOTFIRSTGEN_RPY_3YR_N', 'RPY_5YR_N',
       'COMPL_RPY_5YR_N', 'NONCOM_RPY_5YR_N', 'LO_INC_RPY_5YR_N',
       'MD_INC_RPY_5YR_N', 'HI_INC_RPY_5YR_N', 'DEP_RPY_5YR_N',
       'IND_RPY_5YR_N', 'PELL_RPY_5YR_N', 'NOPELL_RPY_5YR_N',
       'FEMALE_RPY_5YR_N', 'MALE_RPY_5YR_N', 'FIRSTGEN_RPY_5YR_N',
       'NOTFIRSTGEN_RPY_5YR_N', 'RPY_7YR_N', 'COMPL_RPY_7YR_N',
       'NONCOM_RPY_7YR_N', 'LO_INC_RPY_7YR_N', 'MD_INC_RPY_7YR_N',
       'HI_INC_RPY_7YR_N', 'DEP_RPY_7YR_N', 'IND_RPY_7YR_N',
       'PELL_RPY_7YR_N', 'NOPELL_RPY_7YR_N', 'FEMALE_RPY_7YR_N',
       'MALE_RPY_7YR_N', 'FIRSTGEN_RPY_7YR_N', 'NOTFIRSTGEN_RPY_7YR_N',
       'LOAN_EVER', 'PELL_EVER', 'AGE_ENTRY', 'FEMALE', 'MARRIED',
       'DEPENDENT', 'VETERAN', 'FIRST_GEN', 'FAMINC', 'MD_FAMINC',
       'FAMINC_IND', 'DEBT_MDN_SUPP', 'GRAD_DEBT_MDN_SUPP',
       'GRAD_DEBT_MDN10YR_SUPP', 'RPY_3YR_RT_SUPP',
       'LO_INC_RPY_3YR_RT_SUPP', 'MD_INC_RPY_3YR_RT_SUPP',
       'HI_INC_RPY_3YR_RT_SUPP', 'COMPL_RPY_3YR_RT_SUPP',
       'NONCOM_RPY_3YR_RT_SUPP', 'DEP_RPY_3YR_RT_SUPP',
       'IND_RPY_3YR_RT_SUPP', 'PELL_RPY_3YR_RT_SUPP',
       'NOPELL_RPY_3YR_RT_SUPP', 'FEMALE_RPY_3YR_RT_SUPP',
       'MALE_RPY_3YR_RT_SUPP', 'FIRSTGEN_RPY_3YR_RT_SUPP',
       'NOTFIRSTGEN_RPY_3YR_RT_SUPP', 'C150_L4_POOLED_SUPP',
       'C150_4_POOLED_SUPP', 'C200_L4_POOLED_SUPP', 'C200_4_POOLED_SUPP',
       'ALIAS', 'C100_4', 'D100_4', 'C100_L4', 'D100_L4', 'TRANS_4',
       'DTRANS_4', 'TRANS_L4', 'DTRANS_L4', 'ICLEVEL', 'UGDS_MEN',
       'UGDS_WOMEN', 'CDR3_DENOM'], dtype=object)

In [20]:
for i in range(0, len(reducedColumnData)):
    df = reducedColumnData[i]
    temp = df.loc[df['OPEID'].isin(some_values)]


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-20-aa2dc98d9767> in <module>()
      1 for i in range(0, len(reducedColumnData)):
      2     df = reducedColumnData[i]
----> 3     temp = df[df['OPEID'].isin[rowList]]

TypeError: 'method' object is not subscriptable

In [19]:
reducedDataList = []
for i in range(0, len(reducedColumnData)):
    df = reducedColumnData[i]
    temp = df.iloc[df['OPEID'].isin[rowList]]
    reducedDataList.append(temp)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-19-b7d06fa00c46> in <module>()
      2 for i in range(0, len(reducedColumnData)):
      3     df = reducedColumnData[i]
----> 4     temp = df.iloc[df['OPEID'].isin[rowList]]
      5     reducedDataList.append(temp)
      6 

TypeError: 'method' object is not subscriptable

In [ ]:
# Output supportive dataset
path = '../../data/reducedData/'
for i in range(len(reducedDataList)):
    filename = path + nameList[i]
    temp = reducedDataList[i]
    temp.to_csv(filename)
    print(nameList[i] + ' converted.')


# Merge the data set
finalDF = pd.DataFrame()
year11 = reducedDataList[0]
year12 = reducedDataList[1]
year13 = reducedDataList[2]
year14 = reducedDataList[3]
year15 = reducedDataList[4]
# for column in columnList:
#     if year11[column][1].isdigit():
#         finalDF[column] = 0.4 * int(year15[column]) + \
#                           0.3 * int(year14[column]) + 0.2 * int(year13[column]) + \
#                           0.05 * int(year11[column]) + 0.05 * int(year12[column])
#     else:
#         finalDF[column] = year11[column]
# # Output as cleaned-up data
# finalDF.to_csv('../../data/cleanedUp.csv')

In [ ]: