Jess's DIGBlood IPython notebook


In [80]:
# Import stuff
%matplotlib inline
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import scipy
from scipy.stats import pearsonr
from __future__ import division
import matplotlib.pyplot as plt
from statsmodels.nonparametric.smoothers_lowess import lowess

In [81]:
%%bash
python --version


Python 2.7.12 :: Anaconda 2.5.0 (x86_64)

In [82]:
import os
bloodPath =  os.path.abspath(os.path.join(os.getcwd(),'..'))
trainPath =  os.path.join(bloodPath,'data','raw','blood_train.csv')
testPath =  os.path.join(bloodPath,'data','raw','blood_train.csv')

df = pd.DataFrame.from_csv(trainPath)
df.columns = [c.replace(' ', '_') for c in df.columns]

dt = pd.DataFrame.from_csv(testPath)
dt.columns = [c.replace(' ', '_') for c in dt.columns]

In [83]:
#Show the first few lines of the database
df[:5]


Out[83]:
Months_since_Last_Donation Number_of_Donations Total_Volume_Donated_(c.c.) Months_since_First_Donation Made_Donation_in_March_2007
619 2 50 12500 98 1
664 0 13 3250 28 1
441 1 16 4000 35 1
160 2 20 5000 45 1
358 1 24 6000 77 0

In [84]:
dt[:5]


Out[84]:
Months_since_Last_Donation Number_of_Donations Total_Volume_Donated_(c.c.) Months_since_First_Donation Made_Donation_in_March_2007
619 2 50 12500 98 1
664 0 13 3250 28 1
441 1 16 4000 35 1
160 2 20 5000 45 1
358 1 24 6000 77 0

In [85]:
pearsonr(df['Number_of_Donations'],df['Total_Volume_Donated_(c.c.)'])


Out[85]:
(1.0, 0.0)

In [86]:
df['MLDCount'] = df[['Made_Donation_in_March_2007','Months_since_Last_Donation']].groupby('Months_since_Last_Donation').transform(lambda x: x.count())
df[['Made_Donation_in_March_2007','MLDCount']].groupby('MLDCount').mean()
df[['Made_Donation_in_March_2007','Months_since_Last_Donation']].groupby('Months_since_Last_Donation').count()


Out[86]:
Made_Donation_in_March_2007
Months_since_Last_Donation
0 4
1 9
2 140
3 10
4 115
5 2
6 4
7 5
8 5
9 17
10 3
11 60
12 5
13 4
14 60
15 1
16 48
17 1
18 2
20 1
21 37
22 1
23 36
25 1
26 1
35 1
39 1
72 1
74 1

In [91]:
predLabels = ['Months_since_Last_Donation','Months_since_First_Donation','Number_of_Donations']
def getCrossTabExact(predLabels):
    dtP = dt
    dtP['Index'] = dtP.index 
    dfP = df.groupby(predLabels).mean().reset_index()
    dfP = dfP.rename(columns = {'Made_Donation_in_March_2007': 'Estimate'})
    dfP['Weight'] = df.groupby(predLabels).count().reset_index()['Made_Donation_in_March_2007']*len(predLabels)
    dtP = pd.merge(dtP,dfP,how ='left',on=predLabels).set_index('Index')
    dtP = dtP[['Estimate','Weight']]
    dtP.fillna(0)
    return dtP

dtTri = getCrossTabExact(['Months_since_Last_Donation','Months_since_First_Donation','Number_of_Donations'])
dtPair1 = getCrossTabExact(['Months_since_Last_Donation','Months_since_First_Donation'])
dtPair2 = getCrossTabExact(['Months_since_Last_Donation','Number_of_Donations'])
dtPair3 = getCrossTabExact(['Months_since_First_Donation','Number_of_Donations'])
dtMSLD = getCrossTabExact(['Months_since_Last_Donation'])
dtMSFD = getCrossTabExact(['Months_since_First_Donation'])
dtND = getCrossTabExact(['Number_of_Donations'])

In [124]:
def getCrossTabPred(crossTabs,meanVal,meanRevertWeight):
    weights = np.zeros(len(crosstabs[0]))
    wEst = np.zeros(len(crosstabs[0]))
    for ct in crossTabs:
        wEst = (ct.Estimate*ct.Weight)+wEst
        weights = ct.Weight+weights
    wEst = meanVal*meanRevertWeight + wEst
    weights = weights + meanRevertWeight
    pred = wEst/weights
    return pred

In [125]:
meanVal = np.mean(df['Made_Donation_in_March_2007'])
meanRevertWeight = 10
crossTabs = [dtTri,dtPair1,dtPair2 ,dtPair3,dtMSLD,dtMSFD,dtND]
pred = getCrossTabPred(crossTabs,meanVal,meanRevertWeight)

In [126]:
pred


Out[126]:
Index
619    0.425858
664    0.464336
441    0.389931
160    0.414093
358    0.133207
335    0.282260
47     0.388031
164    0.268784
736    0.579861
436    0.201440
460    0.453862
285    0.229167
499    0.455816
356    0.382626
40     0.423367
191    0.432702
638    0.455816
345    0.339923
463    0.339923
372    0.333647
8      0.422862
539    0.422862
734    0.320993
573    0.300398
482    0.337035
330    0.184232
222    0.443117
175    0.443117
606    0.341219
340    0.443759
         ...   
577    0.167095
118    0.215345
532    0.098146
338    0.176326
365    0.209197
173    0.125150
715    0.187660
2      0.152310
387    0.145119
738    0.213905
610    0.142639
450    0.150941
232    0.098169
440    0.098169
183    0.255498
60     0.098169
58     0.098169
49     0.103567
674    0.146083
204    0.139261
361    0.084602
30     0.084602
337    0.084602
496    0.084602
169    0.084602
698    0.084602
433    0.107162
360    0.138613
541    0.113860
74     0.115464
dtype: float64

In [127]:
# Training evaluation
from sklearn.metrics import log_loss
actual = df[actualLabel]
print 'Training log-loss score ' + str(log_loss(actual,np.array(pred)))


Training log-loss score 0.443372394682

In [129]:
X = df.as_matrix()
y = list(df["Made_Donation_in_March_2007"])

In [132]:
y


Out[132]:
[1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [77]:
finalPred = pred
dt['Final'] = finalPred

In [53]:
print 'Training log-loss score ' + str(log_loss(actual,finalPred))


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-53-be148ae6e42a> in <module>()
----> 1 print 'Training log-loss score ' + str(log_loss(actual,finalPred))

//anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.pyc in log_loss(y_true, y_pred, eps, normalize, sample_weight)
   1558 
   1559     # Check if dimensions are consistent.
-> 1560     check_consistent_length(T, Y)
   1561     T = check_array(T)
   1562     Y = check_array(Y)

//anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_consistent_length(*arrays)
    174     if len(uniques) > 1:
    175         raise ValueError("Found arrays with inconsistent numbers of samples: "
--> 176                          "%s" % str(uniques))
    177 
    178 

ValueError: Found arrays with inconsistent numbers of samples: [200 576]

In [79]:
dt['Final'].to_csv('try.csv')

In [ ]: