Jess's DIGBlood IPython notebook


In [2]:
# Import stuff
%matplotlib inline
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
from scipy.stats import pearsonr
from __future__ import division
import matplotlib.pyplot as plt
from statsmodels.nonparametric.smoothers_lowess import lowess


//anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [4]:
df = pd.DataFrame.from_csv('/Users/jesskerlin/Documents/GitHub/digblood/data/raw/blood_train.csv')
df.columns = [c.replace(' ', '_') for c in df.columns]

In [5]:
#Show the first few lines of the database
df[:5]


Out[5]:
Months_since_Last_Donation Number_of_Donations Total_Volume_Donated_(c.c.) Months_since_First_Donation Made_Donation_in_March_2007
619 2 50 12500 98 1
664 0 13 3250 28 1
441 1 16 4000 35 1
160 2 20 5000 45 1
358 1 24 6000 77 0

In [6]:
pearsonr(df['Number_of_Donations'],df['Total_Volume_Donated_(c.c.)'])


Out[6]:
(1.0, 0.0)

In [7]:
data = df[['Months_since_Last_Donation','Made_Donation_in_March_2007']].groupby(['Months_since_Last_Donation']).mean().plot(kind = 'bar')



In [8]:
# From http://stackoverflow.com/questions/18517722/weighted-moving-average-in-python 
def weighted_moving_average(x,y,step_size=0.05,width=1):
    bin_centers  = np.arange(np.min(x),np.max(x)-0.5*step_size,step_size)+0.5*step_size
    bin_avg = np.zeros(len(bin_centers))

    #We're going to weight with a Gaussian function
    def gaussian(x,amp=1,mean=0,sigma=1):
        return amp*np.exp(-(x-mean)**2/(2*sigma**2))

    for index in range(0,len(bin_centers)):
        bin_center = bin_centers[index]
        weights = gaussian(x,mean=bin_center,sigma=width)
        bin_avg[index] = np.average(y,weights=weights)

    return (bin_centers,bin_avg)

data = df[['Months_since_First_Donation','Made_Donation_in_March_2007']].groupby(['Months_since_First_Donation']).mean()
count = df[['Months_since_First_Donation','Made_Donation_in_March_2007']].groupby(['Months_since_First_Donation']).mean()
df = df.sort_values('Months_since_Last_Donation')
x = df['Months_since_Last_Donation'].values #.apply(lambda x: np.log(x)).
y = df['Made_Donation_in_March_2007'].values
scipy.stats.halfnorm
x_out,y_out = weighted_moving_average(x,y,step_size = 1,width = 5)
print smoothed
plt.plot(x_out,y_out)
print x

#plt.bar(data.index,data.Made_Donation_in_March_2007)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-3fa1a30b6914> in <module>()
     20 x = df['Months_since_Last_Donation'].values #.apply(lambda x: np.log(x)).
     21 y = df['Made_Donation_in_March_2007'].values
---> 22 scipy.stats.halfnorm
     23 x_out,y_out = weighted_moving_average(x,y,step_size = 1,width = 5)
     24 print smoothed

NameError: name 'scipy' is not defined

In [9]:
print x


[ 0  0  0  0  1  1  1  1  1  1  1  1  1  2  2  2  2  2  2  2  2  2  2  2  2
  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2  2
  2  2  2  3  3  3  3  3  3  3  3  3  3  4  4  4  4  4  4  4  4  4  4  4  4
  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4
  4  4  4  5  5  6  6  6  6  7  7  7  7  7  8  8  8  8  8  9  9  9  9  9  9
  9  9  9  9  9  9  9  9  9  9  9 10 10 10 11 11 11 11 11 11 11 11 11 11 11
 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 12
 12 12 12 12 13 13 13 13 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14
 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14
 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 15 16 16 16 16 16 16
 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 17 18 18 20 21 21 21 21
 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21
 21 21 21 21 21 21 21 21 22 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 25 26 35 39 72
 74]

Since Total Volume Donated adds no information, I won't include it as a feature.


In [10]:
df.drop('Total_Volume_Donated_(c.c.)', axis = 1)
model = smf.ols('Made_Donation_in_March_2007 ~ Months_since_Last_Donation + Number_of_Donations + Months_since_First_Donation', data = df)
result = model.fit()
result.summary()
result.fittedvalues


Out[10]:
436    0.366096
214    0.343896
676    0.310818
664    0.515535
358    0.602107
607    0.347716
182    0.285178
164    0.462686
106    0.325693
285    0.448608
330    0.437552
441    0.551485
258    0.394102
291    0.308770
392    0.308770
589    0.308770
220    0.308770
410    0.308770
88     0.308770
619    1.106322
631    0.292705
618    0.275603
238    0.294691
587    0.308770
655    0.261525
700    0.126429
19     0.288645
434    0.218251
195    0.251418
519    0.199074
         ...   
645    0.029055
538    0.059198
451    0.640713
325    0.254074
193    0.104635
378    0.068268
597   -0.109064
295   -0.080818
475    0.021971
141    0.046068
576    0.046068
78     0.059198
110    0.046068
588    0.046068
388    0.046068
117    0.046068
514    0.014976
739    0.036050
604    0.039073
22     0.327756
595    0.046157
210    0.053152
180    0.060236
554    0.046068
183    0.056467
281    0.027804
673   -0.147321
541   -0.154085
74    -0.566903
350   -0.591922
dtype: float64

In [11]:
df.keys()


Out[11]:
Index([u'Months_since_Last_Donation', u'Number_of_Donations',
       u'Total_Volume_Donated_(c.c.)', u'Months_since_First_Donation',
       u'Made_Donation_in_March_2007'],
      dtype='object')

In [14]:
mean = df['Made_Donation_in_March_2007'].mean()
df['Means'] = np.ones([576,1])*mean
print df['Means']


436    0.239583
214    0.239583
676    0.239583
664    0.239583
358    0.239583
607    0.239583
182    0.239583
164    0.239583
106    0.239583
285    0.239583
330    0.239583
441    0.239583
258    0.239583
291    0.239583
392    0.239583
589    0.239583
220    0.239583
410    0.239583
88     0.239583
619    0.239583
631    0.239583
618    0.239583
238    0.239583
587    0.239583
655    0.239583
700    0.239583
19     0.239583
434    0.239583
195    0.239583
519    0.239583
         ...   
645    0.239583
538    0.239583
451    0.239583
325    0.239583
193    0.239583
378    0.239583
597    0.239583
295    0.239583
475    0.239583
141    0.239583
576    0.239583
78     0.239583
110    0.239583
588    0.239583
388    0.239583
117    0.239583
514    0.239583
739    0.239583
604    0.239583
22     0.239583
595    0.239583
210    0.239583
180    0.239583
554    0.239583
183    0.239583
281    0.239583
673    0.239583
541    0.239583
74     0.239583
350    0.239583
Name: Means, dtype: float64

In [ ]:


In [16]:
# Training evaluation
from sklearn.metrics import log_loss
pred = np.array(df.Means)
actual = df['Made_Donation_in_March_2007']
print 'Training log-loss score ' + str(log_loss(actual,pred))


Training log-loss score 0.550599168862

In [18]:
df.describe()


Out[18]:
Months_since_Last_Donation Number_of_Donations Total_Volume_Donated_(c.c.) Months_since_First_Donation Made_Donation_in_March_2007 Means
count 576.000000 576.000000 576.000000 576.000000 576.000000 5.760000e+02
mean 9.439236 5.427083 1356.770833 34.050347 0.239583 2.395833e-01
std 8.175454 5.740010 1435.002556 24.227672 0.427200 1.083408e-15
min 0.000000 1.000000 250.000000 2.000000 0.000000 2.395833e-01
25% 2.000000 2.000000 500.000000 16.000000 0.000000 2.395833e-01
50% 7.000000 4.000000 1000.000000 28.000000 0.000000 2.395833e-01
75% 14.000000 7.000000 1750.000000 49.250000 0.000000 2.395833e-01
max 74.000000 50.000000 12500.000000 98.000000 1.000000 2.395833e-01