In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pymysql as mdb

In [131]:
con = mdb.connect('bayesimpact.soumet.com', 'root', 'bayeshack', 'bayes')

In [132]:
with con:
    cur = con.cursor(mdb.cursors.DictCursor)
    cmd = 'SELECT STATE, NAME, NCESID, SCHLEV, YRDATA, TOTALREV \
           FROM census_data_elsec'
    cur.execute(cmd)
    budget = cur.fetchall()
budget = pd.DataFrame(budget)
budget.head()


Out[132]:
NAME NCESID SCHLEV STATE TOTALREV YRDATA
0 AUTAUGA CO SCH DIST 0100240 03 01 72861.00 07
1 BALDWIN CO SCH DIST 0100270 03 01 272334.00 07
2 BARBOUR CO SCH DIST 0100300 03 01 11427.00 07
3 EUFAULA CTY SCH DIST 0101410 03 01 24423.00 07
4 BIBB CO SCH DIST 0100360 03 01 30744.00 07

In [71]:
set(budget.YRDATA)


Out[71]:
{'07', '08', '09', '10'}

In [25]:
budget['TOTALREV'] = budget['TOTALREV'].astype(float)
budget_ca = budget[budget.STATE == '05']

In [135]:
budget_ca.head()


Out[135]:
NAME NCESID SCHLEV STATE TOTALREV YRDATA
680 ALAMEDA CO OFFICE OF ED 0691051 05 05 56050 07
681 ALPINE COUNTY SPECIAL SCHOOLS 0691058 05 05 2036 07
682 AMADOR CO SPL SCHS OPER BY CO SUPT 0691001 05 05 9682 07
683 BUTTE CO SPL SCHS OPER BY CO SUPT 0691002 05 05 75292 07
684 CALAVERAS CO SPL SCHS OPER BY CO SUPT 0691003 05 05 27596 07

In [26]:
budget_ca.groupby('YRDATA')['TOTALREV'].median()


Out[26]:
YRDATA
07        17875
08        17991
09        17739
10        16668
Name: TOTALREV, dtype: float64

In [49]:
budget_rev = budget_ca.pivot_table(index='NCESID', columns='YRDATA', values='TOTALREV',aggfunc='median')
budget_rev.head()


Out[49]:
YRDATA 07 08 09 10
NCESID
0600001 15083 15244 15501 13493
0600006 18603 19417 26203 24322
0600009 4136 3607 3552 3052
0600011 3192 3155 3171 2827
0600012 19103 4265 5695 2172

In [52]:
budget_rev['08_delta'] = (budget_rev['08'] - budget_rev['07'])/ budget_rev['07']
budget_rev['09_delta'] = (budget_rev['09'] - budget_rev['08'])/ budget_rev['08']
budget_rev['10_delta'] = (budget_rev['10'] - budget_rev['09'])/ budget_rev['09']

In [59]:
budget_rev['10_delta'].sort(inplace=False).hist(bins=50)


Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0x11304c3d0>

In [60]:
def fetch_data_mdb(con,cmd):
    with con:
        cur = con.cursor(mdb.cursors.DictCursor)
        cur.execute(cmd)
        df = cur.fetchall()
    return pd.DataFrame(df)

In [133]:
cmd = "select school_ncesid FROM donorschoose_projects WHERE school_state = 'CA' LIMIT 10"
test = fetch_data_mdb(con,cmd)

In [74]:
projects = pd.read_csv('../../data/opendata_projects.csv')#dtype={'school_ncesid': np.int64}, na_values={'school_ncesid': None}

In [221]:
projects.loc[(projects.school_ncesid.isnull()),'school_ncesid'] = -1
projects_ca = projects[projects.school_state=='CA']
projects_ca['school_ncesid'] = projects_ca['school_ncesid'].astype(np.int64).astype(str)
#projects_ca['dist_ncesid'] = projects_ca['school_ncesid'].apply(lambda x : '0'+x[:6])
projects_ca['dist_ncesid'] = projects_ca['school_ncesid'].apply(lambda x : x[:-5])

In [228]:
projects_ca['dist_ncesid'] = projects_ca['dist_ncesid'].apply(lambda x : '0'+ x if len(x)==6 else x)
projects_ca['dist_ncesid'].head()


Out[228]:
5937    0611820
5954    0632130
5964    0602310
5967    0632550
5968    0632550
Name: dist_ncesid, dtype: object

In [227]:
# california district id 
ncesid_ca = projects[projects.school_state=='CA']['dist_ncesid']
ncesid_ca.head()


Out[227]:
5937    0611820
5954    0632130
5964    0602310
5967    0632550
5968    0632550
Name: dist_ncesid, dtype: object

In [206]:
projects_merged = pd.merge(budget_ca, projects_ca, left_on='NCESID', right_on='dist_ncesid')
print projects_merged.shape
projects_merged2 = pd.merge(budget_rev, projects_ca, left_index=True, right_on='dist_ncesid')
projects_merged2.shape


(551350, 51)
Out[206]:
(138451, 52)

In [205]:
def isfunded(x):
    return float(x[x=='completed'].count())/(x[(x=='completed') | (x=='expired')].count())
#budget_ca.shape

In [211]:
projects_merged.groupby('funding_status')['TOTALREV'].median().plot(kind='bar')


Out[211]:
<matplotlib.axes._subplots.AxesSubplot at 0x131ec31d0>

In [212]:
projects_merged2.groupby('funding_status')['10_delta'].median().plot(kind='bar')


Out[212]:
<matplotlib.axes._subplots.AxesSubplot at 0x131fe7110>

In [207]:


In [ ]: