In [1]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [2]:

    
projects = pd.read_csv('../../data/opendata_projects.csv')

Completed (funded) vs. Expired projects

Here's a breakdown of funding status



In [3]:

    
projects.groupby('funding_status')['_projectid'].count()









    Out[3]:





funding_status
completed         515005
expired           207853
live               41871
reallocated         7200
Name: _projectid, dtype: int64

Look at the percentage of completed projects out of completed and expired



In [4]:

    
def isfunded(x):
    return float(x[x=='completed'].count())/(x[(x=='completed') | (x=='expired')].count())
isfunded(projects.funding_status)









    Out[4]:





0.7124566650711481



In [5]:

    
# define a plotting function 
def plotbycol(projects,col):
    projects.groupby(col)['funding_status'].agg(isfunded).sort(inplace=False,ascending=False).plot(kind='bar')
    plt.ylabel('fraction funded')

School State



In [6]:

    
# exclude the one outlier state without enough data (3 entries)
projects.groupby('school_state')['funding_status'].count().sort(inplace=False).head()









    Out[6]:





school_state
La                 3
WY               510
ND               551
VT               653
MT              1041
Name: funding_status, dtype: int64



In [7]:

    
plotbycol(projects[projects['school_state'] != 'La'],'school_state')

Poverty Level



In [8]:

    
plotbycol(projects,'poverty_level')

Primary Focus Area



In [9]:

    
plotbycol(projects,'primary_focus_area')

Grade level



In [10]:

    
plotbycol(projects,'grade_level')

Number of days to completion for completed projects



In [11]:

    
# get the number of days to completion for completed projects 
sl = projects.funding_status == 'completed'
date2completion = pd.to_datetime(projects['date_completed'][sl]) - pd.to_datetime(projects['date_posted'][sl])
date2completion = date2completion.apply(lambda x: x/np.timedelta64(1, 'D'))



In [12]:

    
# bin into categories
bins = [0, 1., 30., 150., date2completion.max()]
indices = ['1 day', '30 days', '150 days', '>150 days']
cutsbydays = pd.cut(date2completion, bins, include_lowest=True,labels=indices)
days_completed_grouped = date2completion.groupby(cutsbydays)
days_completed_grouped.count()









    Out[12]:





1 day         40465
30 days      233291
150 days     228099
>150 days     13149
dtype: int64



In [13]:

    
# make a function to compute cuts as percentages rather than raw counts
def normalized_cut(projects,colname,sl,cutsbydays):
    temp = projects[colname][sl].groupby(cutsbydays)
    raw_counts = temp.value_counts()
    total_counts = temp.count()
    total_counts_converted = raw_counts.copy()
    for days in total_counts.index:
        total_counts_converted[days][:] = total_counts[days]
    return raw_counts/total_counts_converted



In [14]:

    
# define a plotting function 
def plotbycol(projects,colname,sl=sl,cutsbydays=cutsbydays):
    counts_normalized = normalized_cut(projects,colname=colname,sl=sl,cutsbydays=cutsbydays)
    counts_normalized.unstack(level=1).plot(kind='barh',stacked=True).legend(loc='center left',bbox_to_anchor=(1, 0.5))
    plt.xlabel('fraction funded')

Poverty level



In [15]:

    
plotbycol(projects,'poverty_level')

Resources type



In [16]:

    
plotbycol(projects,'resource_type')

Primary focus area



In [17]:

    
plotbycol(projects,'primary_focus_area')

School metro



In [18]:

    
plotbycol(projects,'school_metro')

Budget data



In [19]:

    
import pymysql as mdb
con = mdb.connect('bayesimpact.soumet.com', 'root', 'bayeshack', 'bayes')



In [20]:

    
with con:
    cur = con.cursor(mdb.cursors.DictCursor)
    cmd = 'SELECT STATE, NAME, NCESID, SCHLEV, YRDATA, TOTALREV \
           FROM census_data_elsec'
    cur.execute(cmd)
    budget = cur.fetchall()
budget = pd.DataFrame(budget)
budget.head()









    Out[20]:






  
    
      
      NAME
      NCESID
      SCHLEV
      STATE
      TOTALREV
      YRDATA
    
  
  
    
      0
        AUTAUGA CO SCH DIST
       0100240
       03
       01
        72861.00
       07
    
    
      1
        BALDWIN CO SCH DIST
       0100270
       03
       01
       272334.00
       07
    
    
      2
        BARBOUR CO SCH DIST
       0100300
       03
       01
        11427.00
       07
    
    
      3
       EUFAULA CTY SCH DIST
       0101410
       03
       01
        24423.00
       07
    
    
      4
           BIBB CO SCH DIST
       0100360
       03
       01
        30744.00
       07



In [21]:

    
# pivot by school district and year to get total revenue 
budget['TOTALREV'] = budget['TOTALREV'].astype(float)
budget_rev = budget.pivot_table(index='NCESID', columns='YRDATA', values='TOTALREV',aggfunc='median')
budget_rev.head()



In [22]:

    
# fraciton change in budge 
budget_rev['08_delta'] = (budget_rev['08'] - budget_rev['07'])*100/ budget_rev['07']
budget_rev['09_delta'] = (budget_rev['09'] - budget_rev['08'])*100/ budget_rev['08']
budget_rev['10_delta'] = (budget_rev['10'] - budget_rev['09'])*100/ budget_rev['09']



In [23]:

    
# massage ncesid format in projects data for join 
projects.loc[(projects.school_ncesid.isnull()),'school_ncesid'] = -1
projects['school_ncesid'] = projects['school_ncesid'].astype(np.int64).astype(str)
projects['dist_ncesid'] = projects['school_ncesid'].apply(lambda x : x[:-5])
projects['dist_ncesid'] = projects['dist_ncesid'].apply(lambda x : '0'+ x if len(x)==6 else x)
projects['dist_ncesid'].head()









    Out[23]:





0    3600090
1    3600090
2    3600077
3    3600090
4    3600090
Name: dist_ncesid, dtype: object



In [24]:

    
# look how school revenue in CA relates to projection funding
# for now we only look at changes from 2009 to 2010
projects_merged = pd.merge(budget_rev, projects[projects.school_state=='CA'], left_index=True, right_on='dist_ncesid')
projects_merged.groupby('funding_status')['10_delta'].median().plot(kind='bar')
plt.ylabel('percent change in budget from 09 to 10')









    Out[24]:





<matplotlib.text.Text at 0x11f1cd750>

YRDATA	07	08	09	10
NCESID
0100005	33690	36664	34170	38927
0100006	51251	56712	53153	57766
0100007	151164	169370	154406	149924
0100008	75560	81067	78207	85030
0100011	26180	14844	12892	13842

	NAME	NCESID	SCHLEV	STATE	TOTALREV	YRDATA
0	AUTAUGA CO SCH DIST	0100240	03	01	72861.00	07
1	BALDWIN CO SCH DIST	0100270	03	01	272334.00	07
2	BARBOUR CO SCH DIST	0100300	03	01	11427.00	07
3	EUFAULA CTY SCH DIST	0101410	03	01	24423.00	07
4	BIBB CO SCH DIST	0100360	03	01	30744.00	07