In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# load all the data 
resources = pd.read_csv('../../data/opendata_resources.csv')
print 'resources loaded'
giftcards = pd.read_csv('../../data/opendata_giftcards.csv')
print 'giftcards loaded'
projects = pd.read_csv('../../data/opendata_projects.csv')
print 'projects loaded'
donations = pd.read_csv('../../data/opendata_donations.csv')
print 'donations loaded'
res_cat = pd.read_csv('../../data/resource_categories_11.10.14.csv')
print 'categories loaded'


resources loaded
giftcards loaded
projects loaded
donations loaded
categories loaded

In [26]:
resource_type_counts = resources.project_resource_type.value_counts()
print resource_type_counts


Books         1846592
Supplies      1587535
Technology     600771
Other          321910
Trips            6967
Visitors         1567
dtype: int64

In [33]:
for res_typ in list(resource_type_counts.index):
    sl = resources.project_resource_type == res_typ
    print res_typ, '---------'
    print resources.item_name[sl].value_counts().head(n=10)
    print '--------'


Books ---------
Bullying in Schools                             2216
Wonder                                          1282
AA306 - Level E Book Bin - Gr. 1 - Library 1    1147
AA307 - Level F Book Bin - Gr. 1 - Library 1    1081
AA300X - Leveled Books Classroom Library 1      1068
The Fault in Our Stars                          1052
AA302 - Level A Book Bin - K - Library 1        1037
AA308 - Level G Book Bin - Gr. 1 - Library 1    1025
Hunger Games (PB)                               1012
Divergent                                        884
dtype: int64
--------
Supplies ---------
Item Shipping/Handling Charges                               4845
DD105 - Black Write & Wipe Markers - Thin-Line - Set of 4    2105
JJ689 - Privacy Partition - Set of 10                        1796
Standard Shipping                                            1752
JJ518 - Classroom Magnetic Letters Kit                       1642
JJ119X - At-Your-Seat Storage Sack - Set of 10               1570
KC70 - Write & Wipe Lapboard - Set of 10                     1537
RR423 - Write & Wipe Lapboard Erasers - Set of 10            1518
DE190 - Reusable Write & Wipe Pocket - Set of 10             1342
GG963X - Help-Yourself Book Boxes - Set of 6                 1042
dtype: int64
--------
Technology ---------
Apple® iPad® mini Wi-Fi - tablet - iOS 6 - 16 GB - 7.9    12197
Apple® iPad® 2 Wi-Fi - tablet - iOS 5 - 16 GB - 9.7'     4660
Apple® iPad® with Retina display Wi-Fi - 4th generation - tablet - iOS 6 - 16 GB - 9.7'     2467
Apple iPad Mini MD528LL/A (16GB, Wi-Fi, Black & Slate)           1902
MakerBot Academy - 3D Printing Bundle                            1862
Apple® iPad® with Retina display Wi-Fi - 4th generation     1826
Apple iPad 2 Wi-Fi - tablet - iOS 5 - 16 GB - 9.7'               1750
Apple iPod touch digital player - Apple iOS 5                    1724
Epson DC-06 Document Camera - document camera                    1603
EW208 - Wireless Classroom Headphones Set                        1602
dtype: int64
--------
Other ---------
LA952 - A Place for Everyone Classroom Carpet for 30            1251
Edition 3 - 4 with Around the World (Grades 3 and 4)  TIME For Kids is a weekly classroom newsmagazine that motivates kids to read. Issues cover a wide range of real-world topics and themes kids love to learn about   and it s the best nonfiction text you ll find! Edition 3 - 4 builds non-fiction literacy skills and helps students better understand their world.  In addition to receiving Edition 3 - 4, subscribers will receive TFK Around the World.     647
DD105 - Black Write & Wipe Markers - Thin-Line - Set of 4        541
CK30BU - Big Beanbag Seat - Blue                                 491
Edition 5 - 6 with Around the World  (Grades 5 and 6)  TIME For Kids is a weekly classroom newsmagazine that motivates kids to read. Issues cover a wide range of real-world topics and themes kids love to learn about   and it s the best nonfiction text you ll find! Edition 5 - 6 builds non-fiction literacy skills and helps students become informed and active citizens.  In addition to receiving Edition 5 - 6, subscribers will receive TFK Around the World.     445
TT483 - A Place for Everyone Classroom Carpet for 20             437
Item Shipping/Handling Charges                                   419
CK20BU - Little Beanbag Seat - Blue                              372
CK30GR - Big Beanbag Seat - Green                                368
CK30RD - Big Beanbag Seat - Red                                  364
dtype: int64
--------
Trips ---------
Bullying in Schools                                             6
CAMERA,1TIMEUSE,QKSNP,27                                        2
Cozy 50 X 60 Fleece Blanket (Assorted) Throw                    2
CONSTRUCTN PAPR 130G, ASSORTED 9X12 50PK                        2
Gardening Hand Tools Set                                        2
MODELING COMPOUND, 2 LB BUCKET -- Crayola  Model Magic; White    2
KODAK EASY SHARE C310  BUNDLE DIGITAL PHOTO SOLUTION -- Kodak EasyShare C310 Zoom Digital Camera and EasyShare Printer Dock Series 3 Bundle    2
RR724 - Lakeshore Magnetic Fishing Set                          2
BOUND SKETCHBOOK 81/2X11                                        2
National(R) Brand Composition Book, 10 x 7 7/8, College Ruled, 80 Pages (40 Sheets)    2
dtype: int64
--------
Visitors ---------
Apple® iPad® with Retina display Wi-Fi - 4th generation    3
Blades                                                          2
Tarantulas                                                      2
The Truth About Sparrows                                        1
PAINT ACRYLIC WARM RED CHROMACRYL STUDENT HALF GALLON           1
Song of the Water Boatman and Other Pond Poems Joyce Sidman, Beckie Prange (Illustrator)    1
Point Blank (Alex Rider Series #2) Anthony Horowitz             1
I Love Animals Board Book                                       1
Virco Manufacturing  264500 IQ Series Stack Chairs 15 1/2" Seat Height    1
Gankoqui Double Bell (Small)                                    1
dtype: int64
--------

In [34]:
projects.ix[0]


Out[34]:
_projectid                                "e565fb42185c6e9f22806ad9d5ac8a77"
_teacher_acctid                           "2e17c8c91cb58132d8103a9aa8797e80"
_schoolid                                 "45e7ddbdd7023f1eb65a6cc028d7414f"
school_ncesid                                                    3.60009e+11
school_latitude                                                     40.84169
school_longitude                                                   -73.87546
school_city                                                            Bronx
school_state                                                              NY
school_zip                                                             10460
school_metro                                                           urban
school_district                                     New York City Dept Of Ed
school_county                                                          Bronx
school_charter                                                             f
school_magnet                                                              f
school_year_round                                                          f
school_nlns                                                                f
school_kipp                                                                f
school_charter_ready_promise                                               f
teacher_prefix                                                           Mr.
teacher_teach_for_america                                                  f
teacher_ny_teaching_fellow                                                 f
primary_focus_subject                                               Literacy
primary_focus_area                                       Literacy & Language
secondary_focus_subject                                                  NaN
secondary_focus_area                                                     NaN
resource_type                                                          Books
poverty_level                                                highest poverty
grade_level                                                      Grades 9-12
vendor_shipping_charges                                                  NaN
sales_tax                                                                NaN
payment_processing_charges                                               NaN
fulfillment_labor_materials                                              NaN
total_price_excluding_optional_support                                280.02
total_price_including_optional_support                                341.49
students_reached                                                           0
total_donations                                                          308
num_donors                                                                 1
eligible_double_your_impact_match                                          f
eligible_almost_home_match                                                 f
funding_status                                                     completed
date_posted                                                       2007-03-08
date_completed                                                    2007-03-08
date_thank_you_packet_mailed                                      2007-03-08
date_expiration                                                   2003-12-31
Name: 0, dtype: object

In [45]:
projects.total_price_including_optional_support.describe()


Out[45]:
count      771929.000000
mean          676.180708
std         14344.356543
min             0.000000
25%           323.650000
50%           496.000000
75%           733.530000
max      12500020.730000
Name: total_price_including_optional_support, dtype: float64

In [172]:
date2completion = pd.to_datetime(projects['date_completed']) - pd.to_datetime(projects['date_posted'])
print projects['date_posted'].count()
print projects['date_completed'].count()


771929
518711

In [178]:
# merge projects and resources 
projects_resources = pd.merge(projects, resources, on='_projectid')

In [187]:
projects_resources.shape


Out[187]:
(4365548, 52)

In [216]:
import pymysql as mdb
con = mdb.connect('bayesimpact.soumet.com', 'root', 'bayeshack', 'bayes')

In [200]:
projects_resources2 = projects_resources.copy()

In [217]:
# replace pandas nan's with nulls so that they can be written into sql 
for col in projects_resources2.columns:
    #projects_resources2[col][pd.isnull(projects_resources2[col])] = None
    projects_resources2.loc[(pd.isnull(projects_resources2[col])),col] = None

In [220]:
projects_resources2.to_sql(name='project_resources',con=con,flavor='mysql',if_exists='replace',chunksize=50000)

gift cards


In [89]:
giftcards['date_purchased'] = pd.to_datetime(giftcards['date_purchased'])
giftcards['date_redeemed'] = pd.to_datetime(giftcards['date_redeemed'])
giftcards.ix[0]


Out[89]:
_giftcardid                                 "aa25a74a3cc0d0f25b8a9b71320e4125"
gift_code_amount                                                             1
dollar_amount                                                         under_10
_buyer_acctid                               "a69307dacaddf0fce6ef0df36d20411e"
buyer_city                                                         Santa Clara
buyer_state                                                                 CA
buyer_zip                                                                95054
date_purchased                                             2013-02-28 00:00:00
_buyer_cartid                                                              NaN
_recipient_acctid                           "83f9db1f53cf6b0d991a27b0885ee57b"
recipient_city                                                             NaN
recipient_state                                                            NaN
recipient_zip                                                              NaN
redeemed                                                                     t
date_redeemed                                                              NaN
_redeemed_cartid                            "8d4d1579416c7f0ca486c67d36ba757d"
payment_method                                                no_cash_received
payment_included_acct_credit                                                 f
payment_included_campaign_gift_card                                          t
payment_included_web_purchased_gift_card                                     f
payment_was_promo_matched                                                    f
Name: 0, dtype: object

In [157]:
print giftcards.payment_method.value_counts()
giftcards['dollar_amount'].value_counts()


no_cash_received    328844
creditcard          135793
paypal               23877
check                15816
amazon               11527
promo_code_match         9
dtype: int64
Out[157]:
10_to_100     307792
under_10      121921
100_and_up     86153
dtype: int64

In [148]:
giftcards.payment_was_promo_matched.value_counts()


Out[148]:
f    484601
t     31265
dtype: int64

In [154]:
for amt in giftcards['dollar_amount'].value_counts().index:
    temp = (giftcards.dollar_amount==amt)# & (giftcards.buyer_state=='IL')
    temp.index=giftcards.date_purchased
    temp_resampled = temp.resample('1m',how=np.sum)['2009':]
    #temp_resampled.plot()
    plt.plot(temp_resampled.index,temp_resampled,label=amt)
plt.legend(loc='upper left')


Out[154]:
<matplotlib.legend.Legend at 0x15cbfc0d0>

In [147]:
temp = pd.Series(np.ones(len(giftcards._giftcardid)),index=giftcards.date_purchased)
temp.resample('1m',how=np.sum)['2009':].plot()



In [155]:
# payment that was promo_matched 
temp = giftcards.payment_was_promo_matched=='t'
temp.index=giftcards.date_purchased
temp.resample('1m',how=np.mean)['2009':].plot()


Out[155]:
<matplotlib.axes._subplots.AxesSubplot at 0x15cec2bd0>

donations


In [80]:
donations.ix[0]


Out[80]:
_donationid                                                "e5839a8caf71bc61e3f8770de0a1c3fd"
_projectid                                                 "275c6683624bd7f7d1f2c0336de6f2d2"
_donor_acctid                                              "1a81ded85c3556b9804078ed15015c90"
_cartid                                                    "1264db91feff5c67ea2e3625d7149429"
donor_city                                                                                NaN
donor_state                                                                               NaN
donor_zip                                                                                 NaN
is_teacher_acct                                                                             f
donation_timestamp                                                    2010-04-16 22:23:38.803
donation_to_project                                                                       4.1
donation_optional_support                                                                 0.9
donation_total                                                                              5
dollar_amount                                                                        under_10
donation_included_optional_support                                                          t
payment_method                                                               no_cash_received
payment_included_acct_credit                                                                f
payment_included_campaign_gift_card                                                         t
payment_included_web_purchased_gift_card                                                    f
payment_was_promo_matched                                                                   f
via_giving_page                                                                             f
for_honoree                                                                                 f
donation_message                            "I gave to this project because I have experie...
Name: 0, dtype: object

In [81]:
donations.payment_method.value_counts()


Out[81]:
no_cash_received            1686791
creditcard                  1181337
promo_code_match             371258
paypal                       364998
amazon                       137773
double_your_impact_match     128487
check                         31408
almost_home_match             22163
dtype: int64

In [165]:
donations_by_paymentmethods = donations.groupby('payment_method')['dollar_amount'].value_counts()
donations_by_paymentmethods.unstack(level=0).plot(kind='bar')


Out[165]:
<matplotlib.axes._subplots.AxesSubplot at 0x15a048490>

In [ ]: