Jess's DIGBlood IPython notebook


In [2]:
# Import stuff
%matplotlib inline
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import scipy
from scipy.stats import pearsonr
from __future__ import division
import matplotlib.pyplot as plt
from statsmodels.nonparametric.smoothers_lowess import lowess


//anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [3]:
%%bash
# Print the version of whichever `python` executable the shell resolves.
python --version


Python 2.7.12 :: Anaconda 2.5.0 (x86_64)

In [4]:
import os

# The data directory is resolved relative to the parent of the current
# working directory.
bloodPath = os.path.abspath(os.path.join(os.getcwd(), '..'))
trainPath = os.path.join(bloodPath, 'data', 'raw', 'blood_train.csv')
# NOTE(review): testPath points at the *training* CSV, so `dt` is a second
# copy of the training data. Later cells score predictions built from `dt`
# against the training labels, so this may be deliberate -- confirm before
# switching it to a real test file.
testPath = os.path.join(bloodPath, 'data', 'raw', 'blood_train.csv')

# pd.read_csv(index_col=0, parse_dates=True) is the documented replacement
# for the deprecated pd.DataFrame.from_csv and keeps its defaults.
df = pd.read_csv(trainPath, index_col=0, parse_dates=True)
# Spaces in column names are replaced so the names are easier to reference.
df.columns = [c.replace(' ', '_') for c in df.columns]

dt = pd.read_csv(testPath, index_col=0, parse_dates=True)
dt.columns = [c.replace(' ', '_') for c in dt.columns]

In [5]:
# Preview the first five rows of the training frame.
df.head(5)


Out[5]:
Months_since_Last_Donation Number_of_Donations Total_Volume_Donated_(c.c.) Months_since_First_Donation Made_Donation_in_March_2007
619 2 50 12500 98 1
664 0 13 3250 28 1
441 1 16 4000 35 1
160 2 20 5000 45 1
358 1 24 6000 77 0

In [6]:
# Preview the first five rows of the frame loaded from testPath.
dt.head(5)


Out[6]:
Months_since_Last_Donation Number_of_Donations Total_Volume_Donated_(c.c.) Months_since_First_Donation Made_Donation_in_March_2007
619 2 50 12500 98 1
664 0 13 3250 28 1
441 1 16 4000 35 1
160 2 20 5000 45 1
358 1 24 6000 77 0

In [7]:
# Pearson correlation (coefficient, p-value) between the donation count and
# the total donated volume.
pearsonr(
    df.loc[:, 'Number_of_Donations'],
    df.loc[:, 'Total_Volume_Donated_(c.c.)'],
)


Out[7]:
(1.0, 0.0)

In [16]:
# MLDCount: for every row, the number of training rows that share its
# Months_since_Last_Donation value. Selecting the target column first makes
# the result a Series (not a one-column DataFrame), and the built-in 'count'
# aggregation replaces the per-group Python lambda.
df['MLDCount'] = (
    df.groupby('Months_since_Last_Donation')['Made_Donation_in_March_2007']
      .transform('count')
)
# Mean target per bin size. This expression is evaluated but not rendered,
# because a notebook cell only displays its last expression.
df[['Made_Donation_in_March_2007','MLDCount']].groupby('MLDCount').mean()
# Number of rows per Months_since_Last_Donation value (the displayed output).
df[['Made_Donation_in_March_2007','Months_since_Last_Donation']].groupby('Months_since_Last_Donation').count()


Out[16]:
Made_Donation_in_March_2007
Months_since_Last_Donation
0 4
1 9
2 140
3 10
4 115
5 2
6 4
7 5
8 5
9 17
10 3
11 60
12 5
13 4
14 60
15 1
16 48
17 1
18 2
20 1
21 37
22 1
23 36
25 1
26 1
35 1
39 1
72 1
74 1

In [23]:
predLabels = ['Months_since_Last_Donation','Months_since_First_Donation','Number_of_Donations']
def getCrossTabExact(predLabels, train=None, test=None):
    """Look up the empirical target rate for each test row's exact predictor values.

    The training rows are grouped by ``predLabels``; each group's mean of
    ``Made_Donation_in_March_2007`` becomes ``Estimate`` and its row count
    becomes ``Weight``. These are left-joined onto the test rows, so test rows
    whose predictor combination never occurs in training get NaN in both
    columns.

    Parameters
    ----------
    predLabels : list of str
        Column names to group and join on.
    train : DataFrame, optional
        Frame holding the predictors and the target. Defaults to the
        notebook-level ``df``.
    test : DataFrame, optional
        Frame holding the predictors. Defaults to the notebook-level ``dt``.
        It is not modified.

    Returns
    -------
    DataFrame
        Columns ``['Estimate', 'Weight']``, one row per test row in the
        original order, indexed by the test frame's index (named ``'Index'``).
    """
    if train is None:
        train = df
    if test is None:
        test = dt
    target = 'Made_Donation_in_March_2007'
    predLabels = list(predLabels)

    # Aggregate only the target column: the other columns are never used and
    # averaging them would fail on non-numeric data.
    dfP = train.groupby(predLabels)[target].agg(['mean', 'count']).reset_index()
    dfP = dfP.rename(columns={'mean': 'Estimate', 'count': 'Weight'})

    # Work on a copy of the key columns so the caller's frame does not gain
    # an 'Index' column; the merge discards the index, so it is carried along
    # as a regular column and restored afterwards.
    dtP = test[predLabels].copy()
    dtP['Index'] = test.index
    dtP = pd.merge(dtP, dfP, how='left', on=predLabels).set_index('Index')
    return dtP[['Estimate', 'Weight']]

# Short aliases for the three predictor columns.
msld = 'Months_since_Last_Donation'
msfd = 'Months_since_First_Donation'
nd = 'Number_of_Donations'

# Exact-match lookups on all three predictors, on each pair, and on each
# predictor alone.
dtTri = getCrossTabExact([msld, msfd, nd])
dtPair1 = getCrossTabExact([msld, msfd])
dtPair2 = getCrossTabExact([msld, nd])
dtPair3 = getCrossTabExact([msfd, nd])
dtMSLD = getCrossTabExact([msld])
dtMSFD = getCrossTabExact([msfd])
dtND = getCrossTabExact([nd])

In [20]:
# Three-way lookup table ordered from the smallest to the largest bin.
dtTri.sort_values(by='Weight', ascending=True)


Out[20]:
Estimate Weight
Index
619 1.000000 1
435 0.000000 1
522 0.000000 1
481 0.000000 1
327 0.000000 1
628 0.000000 1
505 1.000000 1
543 0.000000 1
265 0.000000 1
516 0.000000 1
593 0.000000 1
177 0.000000 1
412 0.000000 1
354 0.000000 1
126 0.000000 1
743 0.000000 1
193 0.000000 1
37 0.000000 1
724 0.000000 1
315 0.000000 1
167 0.000000 1
713 0.000000 1
206 0.000000 1
599 0.000000 1
381 0.000000 1
716 0.000000 1
420 0.000000 1
210 0.000000 1
677 1.000000 1
39 1.000000 1
... ... ...
239 0.095238 21
425 0.095238 21
692 0.178571 28
79 0.178571 28
477 0.178571 28
90 0.178571 28
341 0.178571 28
456 0.178571 28
275 0.178571 28
291 0.178571 28
640 0.178571 28
220 0.178571 28
410 0.178571 28
88 0.178571 28
589 0.178571 28
61 0.178571 28
587 0.178571 28
293 0.178571 28
247 0.178571 28
550 0.178571 28
392 0.178571 28
237 0.178571 28
484 0.178571 28
635 0.178571 28
211 0.178571 28
68 0.178571 28
409 0.178571 28
720 0.178571 28
227 0.178571 28
57 0.178571 28

576 rows × 2 columns


In [24]:
# NOTE(review): `out` is not assigned in any cell visible in this notebook,
# so this cell presumably depends on a deleted or unsaved cell -- confirm and
# restore its definition so the notebook survives Restart & Run All.
out


Out[24]:
Estimate Weight
Index
619 0.705000 19.280
664 0.666546 5.530
441 0.571982 5.530
160 0.660357 19.280
358 0.057778 2.280
335 0.332060 30.655
47 0.545139 27.905
164 0.193510 5.655
736 0.780000 1.905
436 0.195017 14.780
460 0.601692 24.280
285 0.153016 3.280
499 0.702768 25.780
356 0.513840 29.280
40 0.689375 19.530
191 0.746667 18.780
638 0.702768 25.780
345 0.452606 34.280
463 0.452606 34.280
372 0.225678 21.780
8 0.542054 32.405
539 0.542054 32.405
734 0.206388 17.530
573 0.215798 26.280
482 0.246537 22.155
330 0.123884 4.405
222 0.629797 21.655
175 0.629797 21.655
606 0.111250 20.155
340 0.589065 26.155
... ... ...
577 0.094201 15.655
118 0.147278 11.030
532 0.068494 22.155
338 0.095972 11.530
365 0.285013 23.280
173 0.082553 14.905
715 0.138048 14.030
2 0.143791 20.530
387 0.119597 18.155
738 0.182778 8.905
610 0.115809 16.030
450 0.150847 16.280
232 0.108035 30.405
440 0.108035 30.405
183 0.090268 7.030
60 0.108035 30.405
58 0.108035 30.405
49 0.072688 14.905
674 0.107333 17.905
204 0.158799 18.030
361 0.089053 29.405
30 0.089053 29.405
337 0.089053 29.405
496 0.089053 29.405
169 0.089053 29.405
698 0.089053 29.405
433 0.077028 16.530
360 0.095314 18.780
541 0.064605 16.155
74 0.075021 15.905

576 rows × 2 columns


In [10]:
def meanRevertBins(predictorLabel,actualLabel,meanRevCount,data=None):
    """Per-bin mean of ``actualLabel``, shrunk toward the mean of the bin means.

    Rows are grouped by ``predictorLabel``. Each bin's mean is blended with a
    prior as ``(mean*count + prior*meanRevCount) / (count + meanRevCount)``,
    where the prior is the unweighted mean of the per-bin means (not the
    overall row mean).

    Parameters
    ----------
    predictorLabel : str
        Column whose distinct values define the bins.
    actualLabel : str
        Column being averaged.
    meanRevCount : number
        Weight given to the prior, expressed as a number of pseudo-rows.
    data : DataFrame, optional
        Frame to bin. Defaults to the notebook-level ``df``.

    Returns
    -------
    x : Index
        The distinct bin values.
    y : ndarray of shape (n_bins, 1)
        The shrunk estimate for each bin, in the same order as ``x``.
    """
    if data is None:
        data = df
    grouped = data[[predictorLabel,actualLabel]].groupby([predictorLabel])
    dataMean = grouped.mean()
    dataCount = grouped.count()
    # Take the prior as an explicit scalar: np.mean on a DataFrame returns a
    # Series or a scalar depending on the pandas version.
    prior = dataMean[actualLabel].mean()
    pred = (dataMean*dataCount+prior*meanRevCount)/(dataCount+meanRevCount)
    x = pred.index
    y = pred.values

    return x,y

In [11]:
# From http://stackoverflow.com/questions/18517722/weighted-moving-average-in-python 
def uniPredict(x,y,testx,fillval=0):
    """Map each value in ``testx`` to its entry in the ``x`` -> ``y`` lookup table.

    Parameters
    ----------
    x : array-like or Index
        Known predictor values.
    y : ndarray
        Estimates aligned with ``x`` along the first axis.
    testx : iterable
        Values to look up.
    fillval : number, optional
        Value used for entries of ``testx`` that do not occur in ``x``.

    Returns
    -------
    ndarray
        Flat array of looked-up values in ``testx`` order. A value that
        matches several entries of ``x`` contributes all matching ``y``
        entries.
    """
    # Collect the pieces and concatenate once; growing the array with
    # np.append inside the loop copied the whole result on every iteration.
    # The empty float array seed keeps the dtype and the empty-input result
    # identical to the np.append-based version.
    pieces = [np.array([])]
    for test in testx:
        if test in x:
            pieces.append(np.ravel(y[test==x]))
        else:
            pieces.append(np.ravel(fillval))
    return np.concatenate(pieces)

In [12]:
actualLabel = 'Made_Donation_in_March_2007'
meanRevCount = 3
# Fallback for predictor values with no bin: the overall training mean.
fillval = np.mean(df[actualLabel])

def binnedTrainPrediction(predictorLabel):
    """Shrunk per-bin estimate of `actualLabel`, evaluated on the training rows.

    Bins `df` on `predictorLabel` with meanRevertBins and looks every
    training row up in the resulting table with uniPredict.
    """
    x,y = meanRevertBins(predictorLabel,actualLabel,meanRevCount)
    return uniPredict(x,y,df[predictorLabel],fillval=fillval)

pred1 = binnedTrainPrediction('Months_since_Last_Donation')
pred2 = binnedTrainPrediction('Months_since_First_Donation')
pred3 = binnedTrainPrediction('Number_of_Donations')
# 'MLDCount' is only present after the cell that assigns df['MLDCount'] has run.
pred4 = binnedTrainPrediction('MLDCount')

# Weighted blend of the four single-predictor estimates; the weights
# (1, 3, 1, 1) sum to the divisor 6.
multipred = (pred1+pred2*3+pred3+pred4)/6

In [13]:
# Training evaluation
from sklearn.metrics import log_loss
actual = df[actualLabel]
# Scores are printed in the order pred1, pred2, pred3, pred4, multipred.
# The parenthesised print gives the same output on Python 2 and Python 3.
for trainPred in (pred1, pred2, pred3, pred4, multipred):
    print('Training log-loss score ' + str(log_loss(actual,trainPred)))


Training log-loss score 0.48873511625
Training log-loss score 0.484721168487
Training log-loss score 0.507067396522
Training log-loss score 0.494623769427
Training log-loss score 0.476482877656

In [14]:
# NOTE(review): dtTri is built from `dt`, which is loaded from the training
# CSV, and is scored here against the training labels. This looks like an
# in-sample score rather than a held-out estimate -- confirm.
# The parenthesised print gives the same output on Python 2 and Python 3.
print('Training log-loss score ' + str(log_loss(actual,np.array(dtTri['Estimate']))))


Training log-loss score 0.12445527956