In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
projects = pd.read_csv('../../data/opendata_projects.csv')
In [19]:
# number of dates to completion -- only look at the completed projects
#projects_ca = projects.ix[projects.school_state == 'CA']
sl = projects.funding_status == 'completed'
date2completion = pd.to_datetime(projects['date_completed'][sl]) - pd.to_datetime(projects['date_posted'][sl])
date2completion.value_counts().plot()
plt.yscale('log')
#plt.axvline(150)
In [35]:
# number of days to completion -- converted into integer
# probably non optimal but why can't i get apply to work
def convert_date(d):
if pd.isnull(d):
return None
else:
return d.days
# date2completion2 = np.zeros(len(date2completion))
# for i,d in enumerate(date2completion):
# date2completion2[i] = convert_date(d)
# date2completion2 = pd.Series(date2completion2)
date2completion2 = date2completion.apply(lambda x: x/np.timedelta64(1, 'D'))
In [5]:
# bin into categories
bins = [0, 1., 30., 150., date2completion2.max()]
indices = ['1 day', '30 days', '150 days', '>150 days']
cutsbydays = pd.cut(date2completion2, bins, include_lowest=True,labels=indices)
days_completed_grouped = date2completion2.groupby(cutsbydays)
In [18]:
# number of days to completion
print days_completed_grouped.count()
In [7]:
# compute cuts as percentages rather than raw counts
def normalized_cut(projects,colname,sl,cutsbydays):
temp = projects[colname][sl].groupby(cutsbydays)
raw_counts = temp.value_counts()
total_counts = temp.count()
total_counts_converted = raw_counts.copy()
for days in total_counts.index:
total_counts_converted[days][:] = total_counts[days]
return raw_counts/total_counts_converted
In [8]:
temp = projects['poverty_level'][sl].groupby(cutsbydays)
raw_counts = temp.value_counts()
total_counts = temp.count()
total_counts_converted = raw_counts.copy()
for days in total_counts.index:
total_counts_converted[days][:] = total_counts[days]
(raw_counts/total_counts_converted).unstack(level=1).plot(kind='barh',stacked=True).legend(loc='center left',bbox_to_anchor=(1, 0.5))
#print raw_counts
#print (raw_counts/total_counts_converted)
Out[8]:
In [41]:
projects['poverty_level'][sl].groupby(cutsbydays).value_counts().unstack(level=0).plot(kind='barh')
Out[41]:
In [9]:
#projects['resource_type'][sl].groupby(cutsbydays).value_counts().unstack(level=0).plot(kind='barh')
counts_normalized = normalized_cut(projects,colname='resource_type',sl=sl,cutsbydays=cutsbydays)
counts_normalized.unstack(level=1).plot(kind='barh',stacked=True).legend(loc='center left',bbox_to_anchor=(1, 0.5))
Out[9]:
In [10]:
counts_normalized = normalized_cut(projects,colname='primary_focus_area',sl=sl,cutsbydays=cutsbydays)
counts_normalized.unstack(level=1).plot(kind='barh',stacked=True).legend(loc='center left',bbox_to_anchor=(1, 0.5))
Out[10]:
In [11]:
counts_normalized = normalized_cut(projects,colname='school_metro',sl=sl,cutsbydays=cutsbydays)
counts_normalized.unstack(level=1).plot(kind='barh',stacked=True).legend(loc='center left',bbox_to_anchor=(1, 0.5))
Out[11]:
In [18]:
projects.groupby('funding_status')['_projectid'].count()
Out[18]:
In [9]:
def isfunded(x):
return float(x[x=='completed'].count())/(x[(x=='completed') | (x=='expired')].count())
isfunded(projects.funding_status)
Out[9]:
In [13]:
# by state
#unfunded = date2completion.map(lambda x: pd.isnull(x))
#unfunded.groupby(projects['school_state']).mean().sort(inplace=False,ascending=False).plot(kind='bar')
projects.groupby('school_state')['funding_status'].agg(isfunded).sort(inplace=False,ascending=False).plot(kind='bar')
Out[13]:
In [14]:
# poverty level
projects.groupby('poverty_level')['funding_status'].agg(isfunded).sort(inplace=False,ascending=False).plot(kind='bar')
Out[14]:
In [15]:
# primary focus area
projects.groupby('primary_focus_area')['funding_status'].agg(isfunded).sort(inplace=False,ascending=False).plot(kind='bar')
Out[15]:
In [17]:
# grade level
projects.groupby('grade_level')['funding_status'].agg(isfunded).sort(inplace=False,ascending=False).plot(kind='bar')
Out[17]:
In [173]:
projects.groupby('funding_status')['_projectid'].count()
Out[173]:
In [38]:
projects.groupby('funding_status').total_price_excluding_optional_support.median()
Out[38]:
In [ ]: