notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



In [2]:

    
projects = pd.read_csv('../../data/opendata_projects.csv')



In [19]:

    
# number of dates to completion -- only look at the completed projects 
#projects_ca = projects.ix[projects.school_state == 'CA']
sl = projects.funding_status == 'completed'
date2completion = pd.to_datetime(projects['date_completed'][sl]) - pd.to_datetime(projects['date_posted'][sl])
date2completion.value_counts().plot()
plt.yscale('log')
#plt.axvline(150)



In [35]:

    
# number of days to completion -- converted into integer 
# probably non optimal but why can't i get apply to work
def convert_date(d):
    if pd.isnull(d):
        return None
    else:
        return d.days
# date2completion2 = np.zeros(len(date2completion))
# for i,d in enumerate(date2completion):
#     date2completion2[i] = convert_date(d)
# date2completion2 = pd.Series(date2completion2)
date2completion2 = date2completion.apply(lambda x: x/np.timedelta64(1, 'D'))



In [5]:

    
# bin into categories
bins = [0, 1., 30., 150., date2completion2.max()]
indices = ['1 day', '30 days', '150 days', '>150 days']
cutsbydays = pd.cut(date2completion2, bins, include_lowest=True,labels=indices)
days_completed_grouped = date2completion2.groupby(cutsbydays)



In [18]:

    
# number of days to completion 
print days_completed_grouped.count()









    



1 day         40465
30 days      233291
150 days     228099
>150 days     13149
dtype: int64



In [7]:

    
# compute cuts as percentages rather than raw counts
def normalized_cut(projects,colname,sl,cutsbydays):
    temp = projects[colname][sl].groupby(cutsbydays)
    raw_counts = temp.value_counts()
    total_counts = temp.count()
    total_counts_converted = raw_counts.copy()
    for days in total_counts.index:
        total_counts_converted[days][:] = total_counts[days]
    return raw_counts/total_counts_converted



In [8]:

    
temp = projects['poverty_level'][sl].groupby(cutsbydays)
raw_counts = temp.value_counts()
total_counts = temp.count()
total_counts_converted = raw_counts.copy()
for days in total_counts.index:
    total_counts_converted[days][:] = total_counts[days]
(raw_counts/total_counts_converted).unstack(level=1).plot(kind='barh',stacked=True).legend(loc='center left',bbox_to_anchor=(1, 0.5))
#print raw_counts
#print (raw_counts/total_counts_converted)









    Out[8]:





<matplotlib.legend.Legend at 0x108e70610>



In [41]:

    
projects['poverty_level'][sl].groupby(cutsbydays).value_counts().unstack(level=0).plot(kind='barh')









    Out[41]:





<matplotlib.axes._subplots.AxesSubplot at 0x113ff6810>



In [9]:

    
#projects['resource_type'][sl].groupby(cutsbydays).value_counts().unstack(level=0).plot(kind='barh')
counts_normalized = normalized_cut(projects,colname='resource_type',sl=sl,cutsbydays=cutsbydays)
counts_normalized.unstack(level=1).plot(kind='barh',stacked=True).legend(loc='center left',bbox_to_anchor=(1, 0.5))









    Out[9]:





<matplotlib.legend.Legend at 0x10bb823d0>



In [10]:

    
counts_normalized = normalized_cut(projects,colname='primary_focus_area',sl=sl,cutsbydays=cutsbydays)
counts_normalized.unstack(level=1).plot(kind='barh',stacked=True).legend(loc='center left',bbox_to_anchor=(1, 0.5))









    Out[10]:





<matplotlib.legend.Legend at 0x10bac4690>



In [11]:

    
counts_normalized = normalized_cut(projects,colname='school_metro',sl=sl,cutsbydays=cutsbydays)
counts_normalized.unstack(level=1).plot(kind='barh',stacked=True).legend(loc='center left',bbox_to_anchor=(1, 0.5))









    Out[11]:





<matplotlib.legend.Legend at 0x10bb8c850>

success of funding broken down by categories



In [18]:

    
projects.groupby('funding_status')['_projectid'].count()









    Out[18]:





funding_status
completed         515005
expired           207853
live               41871
reallocated         7200
Name: _projectid, dtype: int64



In [9]:

    
def isfunded(x):
    return float(x[x=='completed'].count())/(x[(x=='completed') | (x=='expired')].count())
isfunded(projects.funding_status)









    Out[9]:





0.7124566650711481



In [13]:

    
# by state 
#unfunded = date2completion.map(lambda x: pd.isnull(x))
#unfunded.groupby(projects['school_state']).mean().sort(inplace=False,ascending=False).plot(kind='bar')
projects.groupby('school_state')['funding_status'].agg(isfunded).sort(inplace=False,ascending=False).plot(kind='bar')









    Out[13]:





<matplotlib.axes._subplots.AxesSubplot at 0x114039a50>



In [14]:

    
# poverty level 
projects.groupby('poverty_level')['funding_status'].agg(isfunded).sort(inplace=False,ascending=False).plot(kind='bar')









    Out[14]:





<matplotlib.axes._subplots.AxesSubplot at 0x1160ad8d0>



In [15]:

    
# primary focus area 
projects.groupby('primary_focus_area')['funding_status'].agg(isfunded).sort(inplace=False,ascending=False).plot(kind='bar')









    Out[15]:





<matplotlib.axes._subplots.AxesSubplot at 0x11448eb50>



In [17]:

    
# grade level 
projects.groupby('grade_level')['funding_status'].agg(isfunded).sort(inplace=False,ascending=False).plot(kind='bar')









    Out[17]:





<matplotlib.axes._subplots.AxesSubplot at 0x11471a4d0>



In [173]:

    
projects.groupby('funding_status')['_projectid'].count()









    Out[173]:





funding_status
completed         515005
expired           207853
live               41871
reallocated         7200
Name: _projectid, dtype: int64



In [38]:

    
projects.groupby('funding_status').total_price_excluding_optional_support.median()









    Out[38]:





funding_status
completed         377.16
expired           478.43
live              597.70
reallocated       418.55
Name: total_price_excluding_optional_support, dtype: float64



In [ ]: