In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
projects = pd.read_csv('../../data/opendata_projects.csv')

In [19]:
# number of dates to completion -- only look at the completed projects 
#projects_ca = projects.ix[projects.school_state == 'CA']
sl = projects.funding_status == 'completed'
date2completion = pd.to_datetime(projects['date_completed'][sl]) - pd.to_datetime(projects['date_posted'][sl])
date2completion.value_counts().plot()
plt.yscale('log')
#plt.axvline(150)



In [35]:
# number of days to completion -- converted into integer 
# probably non optimal but why can't i get apply to work
def convert_date(d):
    if pd.isnull(d):
        return None
    else:
        return d.days
# date2completion2 = np.zeros(len(date2completion))
# for i,d in enumerate(date2completion):
#     date2completion2[i] = convert_date(d)
# date2completion2 = pd.Series(date2completion2)
date2completion2 = date2completion.apply(lambda x: x/np.timedelta64(1, 'D'))

In [5]:
# bin into categories
bins = [0, 1., 30., 150., date2completion2.max()]
indices = ['1 day', '30 days', '150 days', '>150 days']
cutsbydays = pd.cut(date2completion2, bins, include_lowest=True,labels=indices)
days_completed_grouped = date2completion2.groupby(cutsbydays)

In [18]:
# number of days to completion 
print days_completed_grouped.count()


1 day         40465
30 days      233291
150 days     228099
>150 days     13149
dtype: int64

In [7]:
# compute cuts as percentages rather than raw counts
def normalized_cut(projects,colname,sl,cutsbydays):
    temp = projects[colname][sl].groupby(cutsbydays)
    raw_counts = temp.value_counts()
    total_counts = temp.count()
    total_counts_converted = raw_counts.copy()
    for days in total_counts.index:
        total_counts_converted[days][:] = total_counts[days]
    return raw_counts/total_counts_converted

In [8]:
temp = projects['poverty_level'][sl].groupby(cutsbydays)
raw_counts = temp.value_counts()
total_counts = temp.count()
total_counts_converted = raw_counts.copy()
for days in total_counts.index:
    total_counts_converted[days][:] = total_counts[days]
(raw_counts/total_counts_converted).unstack(level=1).plot(kind='barh',stacked=True).legend(loc='center left',bbox_to_anchor=(1, 0.5))
#print raw_counts
#print (raw_counts/total_counts_converted)


Out[8]:
<matplotlib.legend.Legend at 0x108e70610>

In [41]:
projects['poverty_level'][sl].groupby(cutsbydays).value_counts().unstack(level=0).plot(kind='barh')


Out[41]:
<matplotlib.axes._subplots.AxesSubplot at 0x113ff6810>

In [9]:
#projects['resource_type'][sl].groupby(cutsbydays).value_counts().unstack(level=0).plot(kind='barh')
counts_normalized = normalized_cut(projects,colname='resource_type',sl=sl,cutsbydays=cutsbydays)
counts_normalized.unstack(level=1).plot(kind='barh',stacked=True).legend(loc='center left',bbox_to_anchor=(1, 0.5))


Out[9]:
<matplotlib.legend.Legend at 0x10bb823d0>

In [10]:
counts_normalized = normalized_cut(projects,colname='primary_focus_area',sl=sl,cutsbydays=cutsbydays)
counts_normalized.unstack(level=1).plot(kind='barh',stacked=True).legend(loc='center left',bbox_to_anchor=(1, 0.5))


Out[10]:
<matplotlib.legend.Legend at 0x10bac4690>

In [11]:
counts_normalized = normalized_cut(projects,colname='school_metro',sl=sl,cutsbydays=cutsbydays)
counts_normalized.unstack(level=1).plot(kind='barh',stacked=True).legend(loc='center left',bbox_to_anchor=(1, 0.5))


Out[11]:
<matplotlib.legend.Legend at 0x10bb8c850>

success of funding broken down by categories


In [18]:
projects.groupby('funding_status')['_projectid'].count()


Out[18]:
funding_status
completed         515005
expired           207853
live               41871
reallocated         7200
Name: _projectid, dtype: int64

In [9]:
def isfunded(x):
    return float(x[x=='completed'].count())/(x[(x=='completed') | (x=='expired')].count())
isfunded(projects.funding_status)


Out[9]:
0.7124566650711481

In [13]:
# by state 
#unfunded = date2completion.map(lambda x: pd.isnull(x))
#unfunded.groupby(projects['school_state']).mean().sort(inplace=False,ascending=False).plot(kind='bar')
projects.groupby('school_state')['funding_status'].agg(isfunded).sort(inplace=False,ascending=False).plot(kind='bar')


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x114039a50>

In [14]:
# poverty level 
projects.groupby('poverty_level')['funding_status'].agg(isfunded).sort(inplace=False,ascending=False).plot(kind='bar')


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x1160ad8d0>

In [15]:
# primary focus area 
projects.groupby('primary_focus_area')['funding_status'].agg(isfunded).sort(inplace=False,ascending=False).plot(kind='bar')


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x11448eb50>

In [17]:
# grade level 
projects.groupby('grade_level')['funding_status'].agg(isfunded).sort(inplace=False,ascending=False).plot(kind='bar')


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x11471a4d0>

In [173]:
projects.groupby('funding_status')['_projectid'].count()


Out[173]:
funding_status
completed         515005
expired           207853
live               41871
reallocated         7200
Name: _projectid, dtype: int64

In [38]:
projects.groupby('funding_status').total_price_excluding_optional_support.median()


Out[38]:
funding_status
completed         377.16
expired           478.43
live              597.70
reallocated       418.55
Name: total_price_excluding_optional_support, dtype: float64

In [ ]: