In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
projects = pd.read_csv('../../data/opendata_projects.csv')

Completed (funded) vs. Expired projects

Here's a breakdown of funding status

In [3]:
projects.groupby('funding_status')['_projectid'].count()


Out[3]:
funding_status
completed         515005
expired           207853
live               41871
reallocated         7200
Name: _projectid, dtype: int64
Look at the percentage of completed projects out of completed and expired

In [4]:
def isfunded(x):
    return float(x[x=='completed'].count())/(x[(x=='completed') | (x=='expired')].count())
isfunded(projects.funding_status)


Out[4]:
0.7124566650711481

In [5]:
# define a plotting function 
def plotbycol(projects,col):
    projects.groupby(col)['funding_status'].agg(isfunded).sort(inplace=False,ascending=False).plot(kind='bar')
    plt.ylabel('fraction funded')
School State

In [6]:
# exclude the one outlier state without enough data (3 entries)
projects.groupby('school_state')['funding_status'].count().sort(inplace=False).head()


Out[6]:
school_state
La                 3
WY               510
ND               551
VT               653
MT              1041
Name: funding_status, dtype: int64

In [7]:
plotbycol(projects[projects['school_state'] != 'La'],'school_state')


Poverty Level

In [8]:
plotbycol(projects,'poverty_level')


Primary Focus Area

In [9]:
plotbycol(projects,'primary_focus_area')


Grade level

In [10]:
plotbycol(projects,'grade_level')


Number of days to completion for completed projects


In [11]:
# get the number of days to completion for completed projects 
sl = projects.funding_status == 'completed'
date2completion = pd.to_datetime(projects['date_completed'][sl]) - pd.to_datetime(projects['date_posted'][sl])
date2completion = date2completion.apply(lambda x: x/np.timedelta64(1, 'D'))

In [12]:
# bin into categories
bins = [0, 1., 30., 150., date2completion.max()]
indices = ['1 day', '30 days', '150 days', '>150 days']
cutsbydays = pd.cut(date2completion, bins, include_lowest=True,labels=indices)
days_completed_grouped = date2completion.groupby(cutsbydays)
days_completed_grouped.count()


Out[12]:
1 day         40465
30 days      233291
150 days     228099
>150 days     13149
dtype: int64

In [13]:
# make a function to compute cuts as percentages rather than raw counts
def normalized_cut(projects,colname,sl,cutsbydays):
    temp = projects[colname][sl].groupby(cutsbydays)
    raw_counts = temp.value_counts()
    total_counts = temp.count()
    total_counts_converted = raw_counts.copy()
    for days in total_counts.index:
        total_counts_converted[days][:] = total_counts[days]
    return raw_counts/total_counts_converted

In [14]:
# define a plotting function 
def plotbycol(projects,colname,sl=sl,cutsbydays=cutsbydays):
    counts_normalized = normalized_cut(projects,colname=colname,sl=sl,cutsbydays=cutsbydays)
    counts_normalized.unstack(level=1).plot(kind='barh',stacked=True).legend(loc='center left',bbox_to_anchor=(1, 0.5))
    plt.xlabel('fraction funded')
Poverty level

In [15]:
plotbycol(projects,'poverty_level')


Resources type

In [16]:
plotbycol(projects,'resource_type')


Primary focus area

In [17]:
plotbycol(projects,'primary_focus_area')


School metro

In [18]:
plotbycol(projects,'school_metro')


Budget data


In [19]:
import pymysql as mdb
con = mdb.connect('bayesimpact.soumet.com', 'root', 'bayeshack', 'bayes')

In [20]:
with con:
    cur = con.cursor(mdb.cursors.DictCursor)
    cmd = 'SELECT STATE, NAME, NCESID, SCHLEV, YRDATA, TOTALREV \
           FROM census_data_elsec'
    cur.execute(cmd)
    budget = cur.fetchall()
budget = pd.DataFrame(budget)
budget.head()


Out[20]:
NAME NCESID SCHLEV STATE TOTALREV YRDATA
0 AUTAUGA CO SCH DIST 0100240 03 01 72861.00 07
1 BALDWIN CO SCH DIST 0100270 03 01 272334.00 07
2 BARBOUR CO SCH DIST 0100300 03 01 11427.00 07
3 EUFAULA CTY SCH DIST 0101410 03 01 24423.00 07
4 BIBB CO SCH DIST 0100360 03 01 30744.00 07

In [21]:
# pivot by school district and year to get total revenue 
budget['TOTALREV'] = budget['TOTALREV'].astype(float)
budget_rev = budget.pivot_table(index='NCESID', columns='YRDATA', values='TOTALREV',aggfunc='median')
budget_rev.head()


Out[21]:
YRDATA 07 08 09 10
NCESID
0100005 33690 36664 34170 38927
0100006 51251 56712 53153 57766
0100007 151164 169370 154406 149924
0100008 75560 81067 78207 85030
0100011 26180 14844 12892 13842

In [22]:
# fraciton change in budge 
budget_rev['08_delta'] = (budget_rev['08'] - budget_rev['07'])*100/ budget_rev['07']
budget_rev['09_delta'] = (budget_rev['09'] - budget_rev['08'])*100/ budget_rev['08']
budget_rev['10_delta'] = (budget_rev['10'] - budget_rev['09'])*100/ budget_rev['09']

In [23]:
# massage ncesid format in projects data for join 
projects.loc[(projects.school_ncesid.isnull()),'school_ncesid'] = -1
projects['school_ncesid'] = projects['school_ncesid'].astype(np.int64).astype(str)
projects['dist_ncesid'] = projects['school_ncesid'].apply(lambda x : x[:-5])
projects['dist_ncesid'] = projects['dist_ncesid'].apply(lambda x : '0'+ x if len(x)==6 else x)
projects['dist_ncesid'].head()


Out[23]:
0    3600090
1    3600090
2    3600077
3    3600090
4    3600090
Name: dist_ncesid, dtype: object

In [24]:
# look how school revenue in CA relates to projection funding
# for now we only look at changes from 2009 to 2010
projects_merged = pd.merge(budget_rev, projects[projects.school_state=='CA'], left_index=True, right_on='dist_ncesid')
projects_merged.groupby('funding_status')['10_delta'].median().plot(kind='bar')
plt.ylabel('percent change in budget from 09 to 10')


Out[24]:
<matplotlib.text.Text at 0x11f1cd750>