Clean Data

Story

Show US average of income, debt, net price for schools over the years. Then, allow for users to see how their schools compare.

Goal

  • Clean three variables
  • Find the averages for each
  • Store data in JSON
    • (school_name, income, debt, net_price)
    • average will have school_name == 'avg'

In [2]:
%matplotlib inline

import matplotlib.pyplot as plt
import sqlite3
import pandas as pd
import seaborn as sns

sns.set_style("white")

conn = sqlite3.connect('../data/output/database.sqlite')
c = conn.cursor()

def execute(sql):
    '''Executes a SQL command on the 'c' cursor and returns the results'''
    c.execute(sql)
    return c.fetchall()

def printByYear(data):
    '''Given a list of tuples with (year, data), prints the data next to corresponding year'''
    for datum in data:
        print "{0}: {1}".format(datum[0], datum[1])
        


def adjustForInflation(value, year):
    '''Adjust the dollar value based on year
    Source (http://www.bls.gov/data/inflation_calculator.htm)
    '''
    if value == None:
        return
    return int(value * years[year])

In [3]:
query = """SELECT INSTNM, YEAR, 
                TUITIONFEE_IN, TUITIONFEE_OUT,
                DEBT_MDN, 
                mn_earn_wne_p10, md_earn_wne_p10, pct10_earn_wne_p10, pct25_earn_wne_p10, 
                pct75_earn_wne_p10, pct90_earn_wne_p10
            FROM Scorecard
            WHERE MAIN='Main campus'
              AND PREDDEG = 'Predominantly bachelor''s-degree granting'
        """
rawData = execute(query)

query = """SELECT INSTNM
            FROM Scorecard
            WHERE MAIN='Main campus'
              AND PREDDEG = 'Predominantly bachelor''s-degree granting'
              and CCBASIC NOT LIKE '%Special%'"""
institutions = execute(query)

Now that I have the raw data I want to structure it in a JSON like fashion. instnm -> year -> data


In [67]:
JSON_data = {} # Inflation adjusted JSON structured tree about different institutions

# Get main campus, predominantly bachelor's, non-medical schools that had data for 2013
for inst in institutions:
    JSON_data[inst[0]] = {}

def setValue(data, key, value, year):
    if value and value != 'PrivacySuppressed':
        data[key] = adjustForInflation(value, year)
    
for row in rawData:
    instnm = row[0]
    year = row[1]
    
    if instnm in JSON_data:
        financialData = {}
        
        setValue(financialData, 'tuition_in', row[2], year)
        setValue(financialData, 'tuition_out', row[3], year)
        setValue(financialData, 'debt_mdn', row[4], year)
        setValue(financialData, 'mn_earn', row[5], year)
        setValue(financialData, 'md_earn', row[6], year)
        setValue(financialData, 'md_earn_10', row[7], year)
        setValue(financialData, 'md_earn_25', row[8], year)
        setValue(financialData, 'md_earn_75', row[9], year)
        setValue(financialData, 'md_earn_90', row[10], year)

        if len(financialData) != 0:
            JSON_data[instnm][year] = financialData

# Create averages
JSON_data['avg'] = {}
counts = {}
i=0
for school in JSON_data:
    for year in JSON_data[school]:
        if year not in JSON_data['avg']:
            JSON_data['avg'][year] = JSON_data[school][year]
            counts[year] = {}
            for elem in JSON_data[school][year]:
                counts[year][elem] = 1
        else:
            for elem in JSON_data[school][year]:
                if elem not in JSON_data['avg'][year]:
                    JSON_data['avg'][year][elem] = JSON_data[school][year][elem]
                    counts[year][elem] = 1
                else:
                    JSON_data['avg'][year][elem] += JSON_data[school][year][elem]
                    counts[year][elem] += 1

for year in JSON_data['avg']:
    for elem in JSON_data['avg'][year]:
        JSON_data['avg'][year][elem] = JSON_data['avg'][year][elem] / counts[year][elem]
        
print JSON_data['avg']


{2001: {'tuition_in': 18720, 'debt_mdn': 16461, 'tuition_out': 18720}, 2002: {'tuition_in': 10836, 'debt_mdn': 18067, 'tuition_out': 10836}, 2005: {'tuition_in': 31840, 'debt_mdn': 27611, 'tuition_out': 38402}, 2006: {'tuition_in': 32834, 'debt_mdn': 27400, 'tuition_out': 39206}, 2007: {'md_earn_10': 27275, 'md_earn': 94818, 'debt_mdn': 27242, 'tuition_out': 40414, 'tuition_in': 33943, 'md_earn_75': 134912, 'md_earn_90': 186380, 'mn_earn': 105615, 'md_earn_25': 60355}, 2008: {'tuition_in': 34668, 'debt_mdn': 27370, 'tuition_out': 41178}, 2009: {'md_earn_10': 24155, 'md_earn': 89727, 'debt_mdn': 29367, 'tuition_out': 43148, 'tuition_in': 36324, 'md_earn_75': 127247, 'md_earn_90': 174552, 'mn_earn': 98862, 'md_earn_25': 55809}, 2010: {'tuition_in': 37122, 'debt_mdn': 30845, 'tuition_out': 44033}, 2011: {'md_earn_10': 23374, 'md_earn': 82693, 'debt_mdn': 31393, 'tuition_out': 44592, 'tuition_in': 37781, 'md_earn_75': 118458, 'md_earn_90': 164430, 'mn_earn': 92511, 'md_earn_25': 51644}, 2012: {'tuition_in': 38515, 'debt_mdn': 31897, 'tuition_out': 45410}, 2013: {'tuition_in': 39573, 'debt_mdn': 32927, 'tuition_out': 46593}}

In [68]:
JSON_data['Harvard University']


Out[68]:
{2005: {'debt_mdn': 9263, 'tuition_in': 37874, 'tuition_out': 37874},
 2006: {'debt_mdn': 9329, 'tuition_in': 38428, 'tuition_out': 38428},
 2007: {'debt_mdn': 9610,
  'md_earn': 96570,
  'md_earn_10': 24198,
  'md_earn_25': 55611,
  'md_earn_75': 199467,
  'md_earn_90': 277500,
  'mn_earn': 164724,
  'tuition_in': 38847,
  'tuition_out': 38847},
 2008: {'debt_mdn': 6420, 'tuition_in': 38705, 'tuition_out': 38705},
 2009: {'debt_mdn': 6420,
  'md_earn': 82176,
  'md_earn_10': 20116,
  'md_earn_25': 49755,
  'md_earn_75': 169167,
  'md_earn_90': 257228,
  'mn_earn': 128721,
  'tuition_in': 39602,
  'tuition_out': 39602},
 2010: {'debt_mdn': 6300, 'tuition_in': 40335, 'tuition_out': 40335},
 2011: {'debt_mdn': 6120,
  'md_earn': 88944,
  'md_earn_10': 25194,
  'md_earn_25': 52632,
  'md_earn_75': 173502,
  'md_earn_90': 255000,
  'mn_earn': 133110,
  'tuition_in': 40648,
  'tuition_out': 40648},
 2012: {'debt_mdn': 6250, 'tuition_in': 39966, 'tuition_out': 39966},
 2013: {'debt_mdn': 7500, 'tuition_in': 42292, 'tuition_out': 42292}}

In [ ]: