In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pymysql as mdb
In [131]:
con = mdb.connect('bayesimpact.soumet.com', 'root', 'bayeshack', 'bayes')
In [132]:
with con:
cur = con.cursor(mdb.cursors.DictCursor)
cmd = 'SELECT STATE, NAME, NCESID, SCHLEV, YRDATA, TOTALREV \
FROM census_data_elsec'
cur.execute(cmd)
budget = cur.fetchall()
budget = pd.DataFrame(budget)
budget.head()
Out[132]:
In [71]:
set(budget.YRDATA)
Out[71]:
In [25]:
budget['TOTALREV'] = budget['TOTALREV'].astype(float)
budget_ca = budget[budget.STATE == '05']
In [135]:
budget_ca.head()
Out[135]:
In [26]:
budget_ca.groupby('YRDATA')['TOTALREV'].median()
Out[26]:
In [49]:
budget_rev = budget_ca.pivot_table(index='NCESID', columns='YRDATA', values='TOTALREV',aggfunc='median')
budget_rev.head()
Out[49]:
In [52]:
budget_rev['08_delta'] = (budget_rev['08'] - budget_rev['07'])/ budget_rev['07']
budget_rev['09_delta'] = (budget_rev['09'] - budget_rev['08'])/ budget_rev['08']
budget_rev['10_delta'] = (budget_rev['10'] - budget_rev['09'])/ budget_rev['09']
In [59]:
budget_rev['10_delta'].sort(inplace=False).hist(bins=50)
Out[59]:
In [60]:
def fetch_data_mdb(con,cmd):
with con:
cur = con.cursor(mdb.cursors.DictCursor)
cur.execute(cmd)
df = cur.fetchall()
return pd.DataFrame(df)
In [133]:
cmd = "select school_ncesid FROM donorschoose_projects WHERE school_state = 'CA' LIMIT 10"
test = fetch_data_mdb(con,cmd)
In [74]:
projects = pd.read_csv('../../data/opendata_projects.csv')#dtype={'school_ncesid': np.int64}, na_values={'school_ncesid': None}
In [221]:
projects.loc[(projects.school_ncesid.isnull()),'school_ncesid'] = -1
projects_ca = projects[projects.school_state=='CA']
projects_ca['school_ncesid'] = projects_ca['school_ncesid'].astype(np.int64).astype(str)
#projects_ca['dist_ncesid'] = projects_ca['school_ncesid'].apply(lambda x : '0'+x[:6])
projects_ca['dist_ncesid'] = projects_ca['school_ncesid'].apply(lambda x : x[:-5])
In [228]:
projects_ca['dist_ncesid'] = projects_ca['dist_ncesid'].apply(lambda x : '0'+ x if len(x)==6 else x)
projects_ca['dist_ncesid'].head()
Out[228]:
In [227]:
# california district id
ncesid_ca = projects[projects.school_state=='CA']['dist_ncesid']
ncesid_ca.head()
Out[227]:
In [206]:
projects_merged = pd.merge(budget_ca, projects_ca, left_on='NCESID', right_on='dist_ncesid')
print projects_merged.shape
projects_merged2 = pd.merge(budget_rev, projects_ca, left_index=True, right_on='dist_ncesid')
projects_merged2.shape
Out[206]:
In [205]:
def isfunded(x):
return float(x[x=='completed'].count())/(x[(x=='completed') | (x=='expired')].count())
#budget_ca.shape
In [211]:
projects_merged.groupby('funding_status')['TOTALREV'].median().plot(kind='bar')
Out[211]:
In [212]:
projects_merged2.groupby('funding_status')['10_delta'].median().plot(kind='bar')
Out[212]:
In [207]:
In [ ]: