In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
projects = pd.read_csv('../../data/opendata_projects.csv')
In [3]:
projects.groupby('funding_status')['_projectid'].count()
Out[3]:
In [4]:
def isfunded(x):
return float(x[x=='completed'].count())/(x[(x=='completed') | (x=='expired')].count())
isfunded(projects.funding_status)
Out[4]:
In [5]:
# define a plotting function
def plotbycol(projects,col):
projects.groupby(col)['funding_status'].agg(isfunded).sort(inplace=False,ascending=False).plot(kind='bar')
plt.ylabel('fraction funded')
In [6]:
# exclude the one outlier state without enough data (3 entries)
projects.groupby('school_state')['funding_status'].count().sort(inplace=False).head()
Out[6]:
In [7]:
plotbycol(projects[projects['school_state'] != 'La'],'school_state')
In [8]:
plotbycol(projects,'poverty_level')
In [9]:
plotbycol(projects,'primary_focus_area')
In [10]:
plotbycol(projects,'grade_level')
In [11]:
# get the number of days to completion for completed projects
sl = projects.funding_status == 'completed'
date2completion = pd.to_datetime(projects['date_completed'][sl]) - pd.to_datetime(projects['date_posted'][sl])
date2completion = date2completion.apply(lambda x: x/np.timedelta64(1, 'D'))
In [12]:
# bin into categories
bins = [0, 1., 30., 150., date2completion.max()]
indices = ['1 day', '30 days', '150 days', '>150 days']
cutsbydays = pd.cut(date2completion, bins, include_lowest=True,labels=indices)
days_completed_grouped = date2completion.groupby(cutsbydays)
days_completed_grouped.count()
Out[12]:
In [13]:
# make a function to compute cuts as percentages rather than raw counts
def normalized_cut(projects,colname,sl,cutsbydays):
temp = projects[colname][sl].groupby(cutsbydays)
raw_counts = temp.value_counts()
total_counts = temp.count()
total_counts_converted = raw_counts.copy()
for days in total_counts.index:
total_counts_converted[days][:] = total_counts[days]
return raw_counts/total_counts_converted
In [14]:
# define a plotting function
def plotbycol(projects,colname,sl=sl,cutsbydays=cutsbydays):
counts_normalized = normalized_cut(projects,colname=colname,sl=sl,cutsbydays=cutsbydays)
counts_normalized.unstack(level=1).plot(kind='barh',stacked=True).legend(loc='center left',bbox_to_anchor=(1, 0.5))
plt.xlabel('fraction funded')
In [15]:
plotbycol(projects,'poverty_level')
In [16]:
plotbycol(projects,'resource_type')
In [17]:
plotbycol(projects,'primary_focus_area')
In [18]:
plotbycol(projects,'school_metro')
In [19]:
import pymysql as mdb
con = mdb.connect('bayesimpact.soumet.com', 'root', 'bayeshack', 'bayes')
In [20]:
with con:
cur = con.cursor(mdb.cursors.DictCursor)
cmd = 'SELECT STATE, NAME, NCESID, SCHLEV, YRDATA, TOTALREV \
FROM census_data_elsec'
cur.execute(cmd)
budget = cur.fetchall()
budget = pd.DataFrame(budget)
budget.head()
Out[20]:
In [21]:
# pivot by school district and year to get total revenue
budget['TOTALREV'] = budget['TOTALREV'].astype(float)
budget_rev = budget.pivot_table(index='NCESID', columns='YRDATA', values='TOTALREV',aggfunc='median')
budget_rev.head()
Out[21]:
In [22]:
# fraciton change in budge
budget_rev['08_delta'] = (budget_rev['08'] - budget_rev['07'])*100/ budget_rev['07']
budget_rev['09_delta'] = (budget_rev['09'] - budget_rev['08'])*100/ budget_rev['08']
budget_rev['10_delta'] = (budget_rev['10'] - budget_rev['09'])*100/ budget_rev['09']
In [23]:
# massage ncesid format in projects data for join
projects.loc[(projects.school_ncesid.isnull()),'school_ncesid'] = -1
projects['school_ncesid'] = projects['school_ncesid'].astype(np.int64).astype(str)
projects['dist_ncesid'] = projects['school_ncesid'].apply(lambda x : x[:-5])
projects['dist_ncesid'] = projects['dist_ncesid'].apply(lambda x : '0'+ x if len(x)==6 else x)
projects['dist_ncesid'].head()
Out[23]:
In [24]:
# look how school revenue in CA relates to projection funding
# for now we only look at changes from 2009 to 2010
projects_merged = pd.merge(budget_rev, projects[projects.school_state=='CA'], left_index=True, right_on='dist_ncesid')
projects_merged.groupby('funding_status')['10_delta'].median().plot(kind='bar')
plt.ylabel('percent change in budget from 09 to 10')
Out[24]: