Jess's DIGBlood IPython notebook



In [80]:

    
# Import stuff
%matplotlib inline
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import scipy
from scipy.stats import pearsonr
from __future__ import division
import matplotlib.pyplot as plt
from statsmodels.nonparametric.smoothers_lowess import lowess



In [81]:

    
%%bash
# Print the version of the `python` executable found on the shell PATH.
python --version









    



Python 2.7.12 :: Anaconda 2.5.0 (x86_64)



In [82]:

    
import os

def _loadBloodCsv(csvPath):
    """Read one blood-donation CSV and normalise its column names.

    The first CSV column is used as the row index and every space in a
    column name is replaced by an underscore.

    Parameters
    ----------
    csvPath : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with underscore-separated column names.
    """
    # pd.read_csv(..., index_col=0, parse_dates=True) is the documented
    # replacement for the deprecated pd.DataFrame.from_csv(csvPath).
    frame = pd.read_csv(csvPath, index_col=0, parse_dates=True)
    frame.columns = [c.replace(' ', '_') for c in frame.columns]
    return frame

# bloodPath is the parent of the current working directory.
bloodPath =  os.path.abspath(os.path.join(os.getcwd(),'..'))
trainPath =  os.path.join(bloodPath,'data','raw','blood_train.csv')
# NOTE(review): testPath names the same file as trainPath, so dt is a second
# copy of the training data -- confirm whether a separate test file was intended.
testPath =  os.path.join(bloodPath,'data','raw','blood_train.csv')

df = _loadBloodCsv(trainPath)
dt = _loadBloodCsv(testPath)



In [83]:

    
# Preview the first five rows of the training frame
df.head(5)









    Out[83]:






  
    
      
      Months_since_Last_Donation
      Number_of_Donations
      Total_Volume_Donated_(c.c.)
      Months_since_First_Donation
      Made_Donation_in_March_2007
    
  
  
    
      619
      2
      50
      12500
      98
      1
    
    
      664
      0
      13
      3250
      28
      1
    
    
      441
      1
      16
      4000
      35
      1
    
    
      160
      2
      20
      5000
      45
      1
    
    
      358
      1
      24
      6000
      77
      0



In [84]:

    
# Preview the first five rows of the frame loaded from testPath
dt.head(5)









    Out[84]:






  
    
      
      Months_since_Last_Donation
      Number_of_Donations
      Total_Volume_Donated_(c.c.)
      Months_since_First_Donation
      Made_Donation_in_March_2007
    
  
  
    
      619
      2
      50
      12500
      98
      1
    
    
      664
      0
      13
      3250
      28
      1
    
    
      441
      1
      16
      4000
      35
      1
    
    
      160
      2
      20
      5000
      45
      1
    
    
      358
      1
      24
      6000
      77
      0



In [85]:

    
# Pearson correlation between the number of donations and the donated volume
donationCount = df['Number_of_Donations']
donatedVolume = df['Total_Volume_Donated_(c.c.)']
pearsonr(donationCount, donatedVolume)









    Out[85]:





(1.0, 0.0)



In [86]:

    
# Attach to every row the number of rows sharing its Months_since_Last_Donation value
lastDonationGroups = df.groupby('Months_since_Last_Donation')
df['MLDCount'] = lastDonationGroups['Made_Donation_in_March_2007'].transform(lambda grp: grp.count())
# Mean response per group size; this is not the cell's last expression, so it is not displayed
df.groupby('MLDCount')[['Made_Donation_in_March_2007']].mean()
# Displayed result: row count for each Months_since_Last_Donation value
df.groupby('Months_since_Last_Donation')[['Made_Donation_in_March_2007']].count()









    Out[86]:






  
    
      
      Made_Donation_in_March_2007
    
    
      Months_since_Last_Donation
      
    
  
  
    
      0
      4
    
    
      1
      9
    
    
      2
      140
    
    
      3
      10
    
    
      4
      115
    
    
      5
      2
    
    
      6
      4
    
    
      7
      5
    
    
      8
      5
    
    
      9
      17
    
    
      10
      3
    
    
      11
      60
    
    
      12
      5
    
    
      13
      4
    
    
      14
      60
    
    
      15
      1
    
    
      16
      48
    
    
      17
      1
    
    
      18
      2
    
    
      20
      1
    
    
      21
      37
    
    
      22
      1
    
    
      23
      36
    
    
      25
      1
    
    
      26
      1
    
    
      35
      1
    
    
      39
      1
    
    
      72
      1
    
    
      74
      1



In [91]:

    
predLabels = ['Months_since_Last_Donation','Months_since_First_Donation','Number_of_Donations']
def getCrossTabExact(predLabels, train=None, test=None, target='Made_Donation_in_March_2007'):
    """Look up, for every test row, the training mean of `target` among rows
    with exactly the same values in `predLabels`.

    Parameters
    ----------
    predLabels : list of str
        Columns whose exact value combination defines a group.
    train : pandas.DataFrame, optional
        Frame the group statistics are computed from; defaults to the
        notebook-level `df`.
    test : pandas.DataFrame, optional
        Frame whose rows receive an estimate; defaults to the notebook-level
        `dt`. It is not modified.
    target : str, optional
        Name of the response column in `train`.

    Returns
    -------
    pandas.DataFrame
        Indexed like `test` (index named 'Index') with two columns:
        'Estimate' is the group mean of `target`, and 'Weight' is the group's
        non-null `target` count multiplied by len(predLabels). Test rows whose
        combination never occurs in `train` get Estimate 0 and Weight 0, so
        they contribute nothing to a weighted average.
    """
    if train is None:
        train = df
    if test is None:
        test = dt
    predLabels = list(predLabels)

    # Group mean and group size of the target in a single pass.
    stats = train.groupby(predLabels)[target].agg(['mean', 'count']).reset_index()
    stats = stats.rename(columns={'mean': 'Estimate', 'count': 'Weight'})
    stats['Weight'] = stats['Weight'] * len(predLabels)

    # Work on a copy of the key columns so the caller's frame is left untouched;
    # the original index is carried through the merge in an 'Index' column.
    keys = test[predLabels].copy()
    keys['Index'] = test.index
    merged = pd.merge(keys, stats, how='left', on=predLabels).set_index('Index')

    # fillna returns a new frame, so its result has to be the return value.
    return merged[['Estimate', 'Weight']].fillna(0)

# Short aliases for the three predictor columns
_MSLD = 'Months_since_Last_Donation'
_MSFD = 'Months_since_First_Donation'
_ND = 'Number_of_Donations'

# One cross-tab per predictor subset: the triple, each pair, each single column
dtTri = getCrossTabExact([_MSLD, _MSFD, _ND])
dtPair1 = getCrossTabExact([_MSLD, _MSFD])
dtPair2 = getCrossTabExact([_MSLD, _ND])
dtPair3 = getCrossTabExact([_MSFD, _ND])
dtMSLD = getCrossTabExact([_MSLD])
dtMSFD = getCrossTabExact([_MSFD])
dtND = getCrossTabExact([_ND])



In [124]:

    
def getCrossTabPred(crossTabs,meanVal,meanRevertWeight):
    """Blend several cross-tab estimates into one weighted-average prediction.

    Parameters
    ----------
    crossTabs : sequence of pandas.DataFrame
        Frames with 'Estimate' and 'Weight' columns sharing one index
        (the shape returned by getCrossTabExact).
    meanVal : float
        Value every prediction is pulled towards.
    meanRevertWeight : float
        Weight given to `meanVal` in the average.

    Returns
    -------
    pandas.Series
        (sum(Estimate * Weight) + meanVal * meanRevertWeight)
        / (sum(Weight) + meanRevertWeight), row by row. When `crossTabs` is
        empty a scalar is returned instead. Missing estimates or weights count
        as zero weight. A row whose total weight is zero is not guarded
        against and yields the result of a division by zero.
    """
    # Scalar accumulators broadcast against the first Series, so no length has
    # to be read from crossTabs[0] and an empty list is handled.
    wEst = 0.0
    weights = 0.0
    for ct in crossTabs:
        # A NaN estimate or weight means "no information": count it as zero weight.
        est = ct.Estimate.fillna(0)
        wgt = ct.Weight.fillna(0)
        wEst = (est*wgt)+wEst
        weights = wgt+weights
    # Mean reversion: add the prior value with its own weight.
    wEst = meanVal*meanRevertWeight + wEst
    weights = weights + meanRevertWeight
    pred = wEst/weights
    return pred



In [125]:

    
# Cross-tabs to blend, from the most specific grouping to the single-column ones
crossTabs = [dtTri, dtPair1, dtPair2, dtPair3, dtMSLD, dtMSFD, dtND]
# Weight given to the overall mean when blending
meanRevertWeight = 10
# Overall mean of the response in the training frame
meanVal = df['Made_Donation_in_March_2007'].mean()
pred = getCrossTabPred(crossTabs, meanVal, meanRevertWeight)



In [126]:

    
# Display the blended per-row predictions (pandas truncates the long Series in the output).
pred









    Out[126]:





Index
619    0.425858
664    0.464336
441    0.389931
160    0.414093
358    0.133207
335    0.282260
47     0.388031
164    0.268784
736    0.579861
436    0.201440
460    0.453862
285    0.229167
499    0.455816
356    0.382626
40     0.423367
191    0.432702
638    0.455816
345    0.339923
463    0.339923
372    0.333647
8      0.422862
539    0.422862
734    0.320993
573    0.300398
482    0.337035
330    0.184232
222    0.443117
175    0.443117
606    0.341219
340    0.443759
         ...   
577    0.167095
118    0.215345
532    0.098146
338    0.176326
365    0.209197
173    0.125150
715    0.187660
2      0.152310
387    0.145119
738    0.213905
610    0.142639
450    0.150941
232    0.098169
440    0.098169
183    0.255498
60     0.098169
58     0.098169
49     0.103567
674    0.146083
204    0.139261
361    0.084602
30     0.084602
337    0.084602
496    0.084602
169    0.084602
698    0.084602
433    0.107162
360    0.138613
541    0.113860
74     0.115464
dtype: float64



In [127]:

    
# Training evaluation
from sklearn.metrics import log_loss
# Name of the response column, defined here so the cell runs on a fresh kernel.
actualLabel = 'Made_Donation_in_March_2007'
actual = df[actualLabel]
# np.array(pred) drops the index, so labels and predictions are compared by
# position: pred must hold one value per row of df, in the same row order.
print('Training log-loss score ' + str(log_loss(actual,np.array(pred))))









    



Training log-loss score 0.443372394682



In [129]:

    
# Every column of df as a 2-D NumPy array.
# NOTE(review): at this point df still contains the response column and the
# derived MLDCount column -- confirm that is intended before fitting on X.
X = df.values
# Response values as a plain Python list
y = list(df['Made_Donation_in_March_2007'])



In [132]:

    
# Echo the label list; y is a plain Python list, so every element is printed.
y









    Out[132]:





[1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]



In [77]:

    
# Use the blended cross-tab prediction unchanged as the final prediction.
finalPred = pred
# Store it on dt as a new 'Final' column.
dt['Final'] = finalPred



In [53]:

    
# Score the stored final predictions against the training labels.
# The recorded ValueError reports sample counts of 200 and 576: actual and
# finalPred must cover the same rows for this cell to run.
trainingLogLoss = log_loss(actual, finalPred)
print('Training log-loss score ' + str(trainingLogLoss))









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-53-be148ae6e42a> in <module>()
----> 1 print 'Training log-loss score ' + str(log_loss(actual,finalPred))

//anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.pyc in log_loss(y_true, y_pred, eps, normalize, sample_weight)
   1558 
   1559     # Check if dimensions are consistent.
-> 1560     check_consistent_length(T, Y)
   1561     T = check_array(T)
   1562     Y = check_array(Y)

//anaconda/lib/python2.7/site-packages/sklearn/utils/validation.pyc in check_consistent_length(*arrays)
    174     if len(uniques) > 1:
    175         raise ValueError("Found arrays with inconsistent numbers of samples: "
--> 176                          "%s" % str(uniques))
    177 
    178 

ValueError: Found arrays with inconsistent numbers of samples: [200 576]



In [79]:

    
# Write the 'Final' prediction column to try.csv (relative path, resolved against the working directory).
dt['Final'].to_csv('try.csv')



In [ ]:

	Months_since_Last_Donation	Number_of_Donations	Total_Volume_Donated_(c.c.)	Months_since_First_Donation	Made_Donation_in_March_2007
619	2	50	12500	98	1
664	0	13	3250	28	1
441	1	16	4000	35	1
160	2	20	5000	45	1
358	1	24	6000	77	0

	Made_Donation_in_March_2007
Months_since_Last_Donation
0	4
1	9
2	140
3	10
4	115
5	2
6	4
7	5
8	5
9	17
10	3
11	60
12	5
13	4
14	60
15	1
16	48
17	1
18	2
20	1
21	37
22	1
23	36
25	1
26	1
35	1
39	1
72	1
74	1