In [197]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import scipy
import numpy
import sklearn
#from sklearn import datasets, linear_model, metrics

In [198]:
b = pd.DataFrame.from_csv('../data/raw/blood_train.csv')

In [199]:
b[:5]


Out[199]:
Months since Last Donation Number of Donations Total Volume Donated (c.c.) Months since First Donation Made Donation in March 2007
619 2 50 12500 98 1
664 0 13 3250 28 1
441 1 16 4000 35 1
160 2 20 5000 45 1
358 1 24 6000 77 0

In [200]:
b.describe()


Out[200]:
Months since Last Donation Number of Donations Total Volume Donated (c.c.) Months since First Donation Made Donation in March 2007
count 576.000000 576.000000 576.000000 576.000000 576.000000
mean 9.439236 5.427083 1356.770833 34.050347 0.239583
std 8.175454 5.740010 1435.002556 24.227672 0.427200
min 0.000000 1.000000 250.000000 2.000000 0.000000
25% 2.000000 2.000000 500.000000 16.000000 0.000000
50% 7.000000 4.000000 1000.000000 28.000000 0.000000
75% 14.000000 7.000000 1750.000000 49.250000 0.000000
max 74.000000 50.000000 12500.000000 98.000000 1.000000

In [201]:
#saving this to remember how to plot inline
%matplotlib inline
plt.scatter(x=b['Months since Last Donation'], y=b['Months since First Donation'], c=b['Made Donation in March 2007'], cmap='prism')


Out[201]:
<matplotlib.collections.PathCollection at 0x11f1db790>

In [202]:
%matplotlib inline
b.hist(figsize=(20,20))


Out[202]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11f1e0210>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x11fe7afd0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x12000ac90>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x12007d0d0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1200fd190>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1200a9190>]], dtype=object)

In [203]:
b.kurtosis()


Out[203]:
Months since Last Donation     11.416687
Number of Donations            18.940493
Total Volume Donated (c.c.)    18.940493
Months since First Donation    -0.192772
Made Donation in March 2007    -0.504993
dtype: float64

In [204]:
b.skew()


Out[204]:
Months since Last Donation     2.071107
Number of Donations            3.503331
Total Volume Donated (c.c.)    3.503331
Months since First Donation    0.767706
Made Donation in March 2007    1.223427
dtype: float64

In [205]:
#everything is heavily skewed, well, except months since first donation
#as we have discussed Total Volume Donated (c.c.) is equal to Number of Donations
#in some cases their are major outliers, perhaps we could cap the value at something much lower and set higher values to that
b1 = b[['Months since Last Donation', 'Number of Donations','Months since First Donation',
        'Made Donation in March 2007']]

In [206]:
b1.columns.values


Out[206]:
array(['Months since Last Donation', 'Number of Donations',
       'Months since First Donation', 'Made Donation in March 2007'], dtype=object)

In [207]:
b1[b1['Months since Last Donation'] > 30]
#note people with really big last donations it's equal to their first donation, maybe they donated once and then
#were like "I hate this" and never donated again


Out[207]:
Months since Last Donation Number of Donations Months since First Donation Made Donation in March 2007
673 35 3 64 0
350 74 1 74 0
541 39 1 39 0
74 72 1 72 0

In [208]:
#b1[(b1['Months since Last Donation']==b1['Months since First Donation'])&(b1['Made Donation in March 2007']==1)]
#.count()

In [209]:
#print float(18)/float(144)

In [210]:
#b1.keys()

In [212]:
b1['Months since Last Donation Outliers Removed'] = b1['Months since Last Donation']
b1.loc[b1['Months since Last Donation'] > 35, 'Months since Last Donation Outliers Removed'] = 35
#b1[b1['Months since Last Donation Outliers Removed'].apply(lambda x: x > 35)] = 35


/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  if __name__ == '__main__':
/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame
  from ipykernel import kernelapp as app

In [213]:
b1[b1['Months since Last Donation Outliers Removed'] > 30]


Out[213]:
Months since Last Donation Number of Donations Months since First Donation Made Donation in March 2007 Months since Last Donation Outliers Removed
673 35 3 64 0 35
350 74 1 74 0 35
541 39 1 39 0 35
74 72 1 72 0 35

In [214]:
b1[b1['Number of Donations']>30]


Out[214]:
Months since Last Donation Number of Donations Months since First Donation Made Donation in March 2007 Months since Last Donation Outliers Removed
619 2 50 98 1 2
736 5 46 98 1 5
451 23 38 98 0 23
39 2 43 86 1 2
149 2 44 98 0 2
157 4 33 98 1 4

In [215]:
##okay, need to save this to remember how it works, but sqrt didn't normalize quite as much as I hoped... though way better
b1['Months since Last Donation square root'] = b1['Months since Last Donation Outliers Removed'].apply(math.sqrt)
#b1['Months since Last Donation log'] = (b1['Months since Last Donation']+1).apply(numpy.log)
#b1['Months since Last Donation reciprocal'] = 1/(b1['Months since Last Donation']+1)
#oh yea, log and reciprocal won't work well because there are zeroes, would need to add one to all values before proceeding
#cool, so sqrt wins anyway... 

#b1['Number of Donations square root'] = b1['Number of Donations'].apply(math.sqrt)
b1['Number of Donations log'] = (b1['Number of Donations']+1).apply(numpy.log)
#b1['Number of Donations reciprocal'] = 1/(b1['Number of Donations']+1)
#log wins here

In [216]:
b1.hist(figsize=(20,20))


Out[216]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x120089b10>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1205a4850>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x12112b310>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x12131a2d0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x121390f50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1213ecbd0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x12147cad0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x121601790>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x121670350>]], dtype=object)

In [217]:
print "SKEW"
print b1.skew()
print ""
print "KURTOSIS"
print b1.kurtosis()


SKEW
Months since Last Donation                     2.071107
Number of Donations                            3.503331
Months since First Donation                    0.767706
Made Donation in March 2007                    1.223427
Months since Last Donation Outliers Removed    0.742492
Months since Last Donation square root         0.234546
Number of Donations log                        0.446914
dtype: float64

KURTOSIS
Months since Last Donation                     11.416687
Number of Donations                            18.940493
Months since First Donation                    -0.192772
Made Donation in March 2007                    -0.504993
Months since Last Donation Outliers Removed    -0.305992
Months since Last Donation square root         -1.160537
Number of Donations log                        -0.254753
dtype: float64

In [218]:
#also, is there a way to get odds ratios or percent liklihood from logistic regression in python? there is in R so it seems likely...
b1.keys()


Out[218]:
Index([u'Months since Last Donation', u'Number of Donations', u'Months since First Donation', u'Made Donation in March 2007', u'Months since Last Donation Outliers Removed', u'Months since Last Donation square root', u'Number of Donations log'], dtype='object')

In [219]:
b2 = b1[['Months since Last Donation square root', 'Number of Donations log', 'Months since First Donation', 'Made Donation in March 2007']]

In [220]:
b2[:5]


Out[220]:
Months since Last Donation square root Number of Donations log Months since First Donation Made Donation in March 2007
619 1.414214 3.931826 98 1
664 0.000000 2.639057 28 1
441 1.000000 2.833213 35 1
160 1.414214 3.044522 45 1
358 1.000000 3.218876 77 0

In [221]:
b2.keys()


Out[221]:
Index([u'Months since Last Donation square root', u'Number of Donations log', u'Months since First Donation', u'Made Donation in March 2007'], dtype='object')

In [222]:
pred = b2[['Months since Last Donation square root', 'Number of Donations log', 'Months since First Donation']]
outcomes = b2[['Made Donation in March 2007']]
fit1 = linear_model.LinearRegression()

In [223]:
fit1.fit(pred, outcomes)


Out[223]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [224]:
fit2 = linear_model.LogisticRegression()
fit2.fit(pred, outcomes)


Out[224]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [225]:
fit2.coef_


Out[225]:
array([[-0.45450825,  1.38811617, -0.02827812]])

In [226]:
v= pd.DataFrame(fit2.predict_proba(pred))

In [227]:
v.describe()


Out[227]:
0 1
count 576.000000 576.000000
mean 0.757990 0.242010
std 0.166845 0.166845
min 0.186097 0.005404
25% 0.651773 0.095229
50% 0.793474 0.206526
75% 0.904771 0.348227
max 0.994596 0.813903

In [228]:
v[:5]


Out[228]:
0 1
0 0.343475 0.656525
1 0.186097 0.813903
2 0.251124 0.748876
3 0.285997 0.714003
4 0.391671 0.608329

In [229]:
bt = pd.DataFrame.from_csv('../data/raw/blood_test.csv')

In [230]:
#need to to do square-rooting and logging and what not
bt[:5]
#could just redo by hand, or make a function that does above?
#vt= pd.DataFrame(fit2.predict_proba(bt))


Out[230]:
Months since Last Donation Number of Donations Total Volume Donated (c.c.) Months since First Donation
659 2 12 3000 52
276 21 7 1750 38
263 4 1 250 4
303 11 11 2750 38
83 4 12 3000 34

In [231]:
bt.count()


Out[231]:
Months since Last Donation     200
Number of Donations            200
Total Volume Donated (c.c.)    200
Months since First Donation    200
dtype: int64

In [232]:
b2.iloc[1:4, 0:3]


Out[232]:
Months since Last Donation square root Number of Donations log Months since First Donation
664 0.000000 2.639057 28
441 1.000000 2.833213 35
160 1.414214 3.044522 45

In [233]:
X = b2.iloc[:,0:3].as_matrix()
y = list(b2["Made Donation in March 2007"])

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

clf   = linear_model.LogisticRegression()
score = sklearn.model_selection.cross_val_score( 
    clf, 
    X, y,
    scoring="neg_log_loss")

print(score)


[-0.57866836 -0.50813732 -0.48442203]

In [234]:
bt[1:5]


Out[234]:
Months since Last Donation Number of Donations Total Volume Donated (c.c.) Months since First Donation
276 21 7 1750 38
263 4 1 250 4
303 11 11 2750 38
83 4 12 3000 34

In [241]:
bt1 = bt[['Months since Last Donation', 'Number of Donations','Months since First Donation']]
bt1['Months since Last Donation Outliers Removed'] = bt1['Months since Last Donation']
bt1.loc[bt1['Months since Last Donation'] > 35, 'Months since Last Donation Outliers Removed'] = 35


/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame
  app.launch_new_instance()

In [242]:
bt1[1:5]


Out[242]:
Months since Last Donation Number of Donations Months since First Donation Months since Last Donation Outliers Removed
276 21 7 38 21
263 4 1 4 4
303 11 11 38 11
83 4 12 34 4

In [243]:
bt1['Months since Last Donation square root'] = bt1['Months since Last Donation Outliers Removed'].apply(math.sqrt)

bt1['Number of Donations log'] = (bt1['Number of Donations']+1).apply(numpy.log)

In [244]:
bt2 = bt1[['Months since Last Donation square root', 'Number of Donations log', 'Months since First Donation']]

In [245]:
bt2[1:5]


Out[245]:
Months since Last Donation square root Number of Donations log Months since First Donation
276 4.582576 2.079442 38
263 2.000000 0.693147 4
303 3.316625 2.484907 38
83 2.000000 2.564949 34

In [246]:
clf.fit(X,y)


Out[246]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [247]:
clf.predict_proba(bt2)


Out[247]:
array([[ 0.48718888,  0.51281112],
       [ 0.84115388,  0.15884612],
       [ 0.81090761,  0.18909239],
       [ 0.62916139,  0.37083861],
       [ 0.42702072,  0.57297928],
       [ 0.28498733,  0.71501267],
       [ 0.70952654,  0.29047346],
       [ 0.92623866,  0.07376134],
       [ 0.98913085,  0.01086915],
       [ 0.9353713 ,  0.0646287 ],
       [ 0.8267581 ,  0.1732419 ],
       [ 0.85447363,  0.14552637],
       [ 0.74721038,  0.25278962],
       [ 0.81090761,  0.18909239],
       [ 0.69128186,  0.30871814],
       [ 0.9048472 ,  0.0951528 ],
       [ 0.71858898,  0.28141102],
       [ 0.5490258 ,  0.4509742 ],
       [ 0.92030364,  0.07969636],
       [ 0.68334093,  0.31665907],
       [ 0.49307267,  0.50692733],
       [ 0.64418143,  0.35581857],
       [ 0.56488066,  0.43511934],
       [ 0.81090761,  0.18909239],
       [ 0.76655514,  0.23344486],
       [ 0.74745776,  0.25254224],
       [ 0.6952517 ,  0.3047483 ],
       [ 0.92623866,  0.07376134],
       [ 0.69700726,  0.30299274],
       [ 0.48718888,  0.51281112],
       [ 0.72667235,  0.27332765],
       [ 0.9599853 ,  0.0400147 ],
       [ 0.75641374,  0.24358626],
       [ 0.74242273,  0.25757727],
       [ 0.87428357,  0.12571643],
       [ 0.69782257,  0.30217743],
       [ 0.84199285,  0.15800715],
       [ 0.92623866,  0.07376134],
       [ 0.59763309,  0.40236691],
       [ 0.81090761,  0.18909239],
       [ 0.65177281,  0.34822719],
       [ 0.81795835,  0.18204165],
       [ 0.94285762,  0.05714238],
       [ 0.57121026,  0.42878974],
       [ 0.88583201,  0.11416799],
       [ 0.75641374,  0.24358626],
       [ 0.61462171,  0.38537829],
       [ 0.89678193,  0.10321807],
       [ 0.92623866,  0.07376134],
       [ 0.80237255,  0.19762745],
       [ 0.47290016,  0.52709984],
       [ 0.54439953,  0.45560047],
       [ 0.81948796,  0.18051204],
       [ 0.66966536,  0.33033464],
       [ 0.95792316,  0.04207684],
       [ 0.78982164,  0.21017836],
       [ 0.79078082,  0.20921918],
       [ 0.53217142,  0.46782858],
       [ 0.68877043,  0.31122957],
       [ 0.49996354,  0.50003646],
       [ 0.81090761,  0.18909239],
       [ 0.92077753,  0.07922247],
       [ 0.75574513,  0.24425487],
       [ 0.62821077,  0.37178923],
       [ 0.92077753,  0.07922247],
       [ 0.920744  ,  0.079256  ],
       [ 0.91399925,  0.08600075],
       [ 0.79506758,  0.20493242],
       [ 0.64542481,  0.35457519],
       [ 0.81090761,  0.18909239],
       [ 0.80312721,  0.19687279],
       [ 0.83621088,  0.16378912],
       [ 0.9851861 ,  0.0148139 ],
       [ 0.81090761,  0.18909239],
       [ 0.94332504,  0.05667496],
       [ 0.86136629,  0.13863371],
       [ 0.87696757,  0.12303243],
       [ 0.74294437,  0.25705563],
       [ 0.87136503,  0.12863497],
       [ 0.53217142,  0.46782858],
       [ 0.9851861 ,  0.0148139 ],
       [ 0.62916139,  0.37083861],
       [ 0.6952517 ,  0.3047483 ],
       [ 0.65177281,  0.34822719],
       [ 0.47290016,  0.52709984],
       [ 0.95732168,  0.04267832],
       [ 0.9048472 ,  0.0951528 ],
       [ 0.91865571,  0.08134429],
       [ 0.79506758,  0.20493242],
       [ 0.63178315,  0.36821685],
       [ 0.81090761,  0.18909239],
       [ 0.90468807,  0.09531193],
       [ 0.88583201,  0.11416799],
       [ 0.89486613,  0.10513387],
       [ 0.71008085,  0.28991915],
       [ 0.65514518,  0.34485482],
       [ 0.61495789,  0.38504211],
       [ 0.72771786,  0.27228214],
       [ 0.91177474,  0.08822526],
       [ 0.70952654,  0.29047346],
       [ 0.75641374,  0.24358626],
       [ 0.86322911,  0.13677089],
       [ 0.92718108,  0.07281892],
       [ 0.75045451,  0.24954549],
       [ 0.86186176,  0.13813824],
       [ 0.52889339,  0.47110661],
       [ 0.82378974,  0.17621026],
       [ 0.52393181,  0.47606819],
       [ 0.81090761,  0.18909239],
       [ 0.86801194,  0.13198806],
       [ 0.84382018,  0.15617982],
       [ 0.81090761,  0.18909239],
       [ 0.9599853 ,  0.0400147 ],
       [ 0.51168762,  0.48831238],
       [ 0.62916139,  0.37083861],
       [ 0.9529663 ,  0.0470337 ],
       [ 0.96317049,  0.03682951],
       [ 0.5523711 ,  0.4476289 ],
       [ 0.36510234,  0.63489766],
       [ 0.76208147,  0.23791853],
       [ 0.86322911,  0.13677089],
       [ 0.46106651,  0.53893349],
       [ 0.83627382,  0.16372618],
       [ 0.26310167,  0.73689833],
       [ 0.54503631,  0.45496369],
       [ 0.59782917,  0.40217083],
       [ 0.71189772,  0.28810228],
       [ 0.84415173,  0.15584827],
       [ 0.83627382,  0.16372618],
       [ 0.87733852,  0.12266148],
       [ 0.6061722 ,  0.3938278 ],
       [ 0.9048472 ,  0.0951528 ],
       [ 0.80269526,  0.19730474],
       [ 0.81090761,  0.18909239],
       [ 0.74242273,  0.25757727],
       [ 0.70952654,  0.29047346],
       [ 0.9048472 ,  0.0951528 ],
       [ 0.95804072,  0.04195928],
       [ 0.32746159,  0.67253841],
       [ 0.77721981,  0.22278019],
       [ 0.32759313,  0.67240687],
       [ 0.81090761,  0.18909239],
       [ 0.6952517 ,  0.3047483 ],
       [ 0.79078082,  0.20921918],
       [ 0.81090761,  0.18909239],
       [ 0.96317049,  0.03682951],
       [ 0.9334603 ,  0.0665397 ],
       [ 0.67939269,  0.32060731],
       [ 0.96784826,  0.03215174],
       [ 0.75641374,  0.24358626],
       [ 0.95743306,  0.04256694],
       [ 0.36925666,  0.63074334],
       [ 0.65482439,  0.34517561],
       [ 0.83807394,  0.16192606],
       [ 0.86139653,  0.13860347],
       [ 0.73306738,  0.26693262],
       [ 0.93727827,  0.06272173],
       [ 0.75641374,  0.24358626],
       [ 0.54439953,  0.45560047],
       [ 0.9048472 ,  0.0951528 ],
       [ 0.75574513,  0.24425487],
       [ 0.92623866,  0.07376134],
       [ 0.56321542,  0.43678458],
       [ 0.50391923,  0.49608077],
       [ 0.82989092,  0.17010908],
       [ 0.93723635,  0.06276365],
       [ 0.6952517 ,  0.3047483 ],
       [ 0.72667235,  0.27332765],
       [ 0.81090761,  0.18909239],
       [ 0.68334093,  0.31665907],
       [ 0.96317049,  0.03682951],
       [ 0.98598926,  0.01401074],
       [ 0.81090761,  0.18909239],
       [ 0.4065287 ,  0.5934713 ],
       [ 0.90468807,  0.09531193],
       [ 0.93270528,  0.06729472],
       [ 0.59119204,  0.40880796],
       [ 0.76218628,  0.23781372],
       [ 0.94525932,  0.05474068],
       [ 0.81090761,  0.18909239],
       [ 0.92623866,  0.07376134],
       [ 0.79403137,  0.20596863],
       [ 0.46106651,  0.53893349],
       [ 0.93727827,  0.06272173],
       [ 0.58700881,  0.41299119],
       [ 0.87733852,  0.12266148],
       [ 0.87696757,  0.12303243],
       [ 0.9048472 ,  0.0951528 ],
       [ 0.75641374,  0.24358626],
       [ 0.87733852,  0.12266148],
       [ 0.38546809,  0.61453191],
       [ 0.78920914,  0.21079086],
       [ 0.89486613,  0.10513387],
       [ 0.9161873 ,  0.0838127 ],
       [ 0.86136629,  0.13863371],
       [ 0.93727827,  0.06272173],
       [ 0.93112959,  0.06887041],
       [ 0.86134348,  0.13865652],
       [ 0.94492913,  0.05507087],
       [ 0.66048024,  0.33951976]])

In [248]:
p = clf.predict_proba(bt2)

In [249]:
bt2['Made Donation in March 2007'] = p[:,1]
bt2.head()


Out[249]:
Months since Last Donation square root Number of Donations log Months since First Donation Made Donation in March 2007
659 1.414214 2.564949 52 0.512811
276 4.582576 2.079442 38 0.158846
263 2.000000 0.693147 4 0.189092
303 3.316625 2.484907 38 0.370839
83 2.000000 2.564949 34 0.572979

In [256]:
submit_dir = '../data/processed/'
submit_filename = 'submit_cleaned_logistic_regression.csv'
bt2.to_csv(submit_dir+submit_filename, 
         #  columns =('Unnamed: 0', 'Made Donation in March 2007'), 
           index=True)

In [ ]: