In [2]:
# Import stuff
%matplotlib inline
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import scipy
from scipy.stats import pearsonr
from __future__ import division
import matplotlib.pyplot as plt
from statsmodels.nonparametric.smoothers_lowess import lowess
In [3]:
%%bash
# Print the version of the `python` executable found on the shell PATH.
python --version
In [4]:
import os

# Data files are looked up under <parent of the current working directory>/data/raw.
bloodPath = os.path.abspath(os.path.join(os.getcwd(), '..'))
trainPath = os.path.join(bloodPath, 'data', 'raw', 'blood_train.csv')
# NOTE(review): testPath names the same file as trainPath, so `dt` is a second
# copy of the training data. Later cells score `dt`-derived estimates against
# training labels, so this may be deliberate -- TODO confirm before pointing it
# at a separate test file.
testPath = os.path.join(bloodPath, 'data', 'raw', 'blood_train.csv')

# DataFrame.from_csv is deprecated (and removed in later pandas releases);
# read_csv with index_col=0 and parse_dates=True is its documented equivalent.
df = pd.read_csv(trainPath, index_col=0, parse_dates=True)
# Replace spaces in column names with underscores.
df.columns = [c.replace(' ', '_') for c in df.columns]
dt = pd.read_csv(testPath, index_col=0, parse_dates=True)
dt.columns = [c.replace(' ', '_') for c in dt.columns]
In [5]:
# Preview the first five rows of the training frame.
df.head(5)
Out[5]:
In [6]:
# Preview the first five rows of the `dt` frame.
dt.head(5)
Out[6]:
In [7]:
# Pearson correlation between the number of donations and the total volume donated.
pearsonr(
    df.loc[:, 'Number_of_Donations'],
    df.loc[:, 'Total_Volume_Donated_(c.c.)'],
)
Out[7]:
In [16]:
# For every row, count the Made_Donation_in_March_2007 values among the rows
# that share its Months_since_Last_Donation value, and store it as MLDCount.
df['MLDCount'] = (
    df[['Made_Donation_in_March_2007', 'Months_since_Last_Donation']]
    .groupby(by='Months_since_Last_Donation')
    .transform(lambda col: col.count())
)

# Mean outcome for each MLDCount value. This is not the last expression of the
# cell, so the notebook does not render its result.
(
    df[['Made_Donation_in_March_2007', 'MLDCount']]
    .groupby(by='MLDCount')
    .mean()
)

# Displayed result: count of outcome values per Months_since_Last_Donation value.
(
    df[['Made_Donation_in_March_2007', 'Months_since_Last_Donation']]
    .groupby(by='Months_since_Last_Donation')
    .count()
)
Out[16]:
In [23]:
# Module-level list of the three predictor column names.
predLabels = [
    'Months_since_Last_Donation',
    'Months_since_First_Donation',
    'Number_of_Donations',
]
def getCrossTabExact(predLabels):
    """Look up exact-match training outcome statistics for every row of ``dt``.

    The module-level training frame ``df`` is grouped by the columns named in
    ``predLabels``. Each row of the module-level frame ``dt`` is then
    left-joined to the group whose values in those columns are identical.

    Unlike the previous implementation this works on a copy of ``dt``, so the
    global frame no longer gains an ``'Index'`` column as a side effect.

    Parameters
    ----------
    predLabels : str or list of str
        Column name(s), present in both ``df`` and ``dt``, to group and join on.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``dt``, in the same order, indexed by ``dt``'s
        original index (index name ``'Index'``), with two columns:
        ``'Estimate'`` - mean of ``Made_Donation_in_March_2007`` in the
        matching training group; ``'Weight'`` - number of non-null outcome
        values in that group. Both are NaN when no training group matches.
    """
    target = 'Made_Donation_in_March_2007'

    # A single aggregation over the target column yields both statistics.
    stats = (
        df.groupby(predLabels)[target]
        .agg(['mean', 'count'])
        .rename(columns={'mean': 'Estimate', 'count': 'Weight'})
        .reset_index()
    )

    # Copy so the caller-visible global `dt` is left untouched; carry the
    # original index through the merge as an ordinary column.
    lookup = dt.copy()
    lookup['Index'] = lookup.index

    merged = pd.merge(lookup, stats, how='left', on=predLabels).set_index('Index')
    return merged[['Estimate', 'Weight']]
# Exact-match lookups for the full predictor triple, each pair of predictors,
# and each predictor on its own. The calls run in the order listed.
(dtTri,
 dtPair1, dtPair2, dtPair3,
 dtMSLD, dtMSFD, dtND) = map(getCrossTabExact, [
    ['Months_since_Last_Donation', 'Months_since_First_Donation', 'Number_of_Donations'],  # dtTri
    ['Months_since_Last_Donation', 'Months_since_First_Donation'],  # dtPair1
    ['Months_since_Last_Donation', 'Number_of_Donations'],  # dtPair2
    ['Months_since_First_Donation', 'Number_of_Donations'],  # dtPair3
    ['Months_since_Last_Donation'],  # dtMSLD
    ['Months_since_First_Donation'],  # dtMSFD
    ['Number_of_Donations'],  # dtND
])
In [20]:
# Show the three-way cross-tab ordered by ascending Weight.
dtTri.sort_values(by='Weight', ascending=True)
Out[20]:
In [24]:
# NOTE(review): `out` is not assigned in any of the visible cells; unless it is
# defined elsewhere, this raises NameError on a fresh kernel -- TODO confirm or remove.
out
Out[24]:
In [10]:
def meanRevertBins(predictorLabel,actualLabel,meanRevCount):
    """Per-bin outcome means, pulled towards the average of the bin means.

    Rows of the module-level frame ``df`` are grouped by ``predictorLabel``.
    For every group the mean and the count of ``actualLabel`` are combined
    with ``meanRevCount`` pseudo-observations whose value is the mean of the
    per-group means::

        (mean * count + mean_of_means * meanRevCount) / (count + meanRevCount)

    Parameters
    ----------
    predictorLabel : str
        Column of ``df`` whose distinct values define the bins.
    actualLabel : str
        Column of ``df`` holding the outcome to average.
    meanRevCount : number
        Weight of the pseudo-observations added to every bin.

    Returns
    -------
    tuple
        ``(x, y)`` where ``x`` is the index of bin values and ``y`` is the
        array of adjusted means (two-dimensional, one column).
    """
    byBin = df[[predictorLabel, actualLabel]].groupby([predictorLabel])
    binMean = byBin.mean()
    binCount = byBin.count()

    # Target that every bin is shrunk towards: the average of the bin means.
    overall = np.mean(binMean)

    numerator = binMean * binCount + overall * meanRevCount
    denominator = binCount + meanRevCount
    shrunk = numerator / denominator
    return shrunk.index, shrunk.values
In [11]:
# From http://stackoverflow.com/questions/18517722/weighted-moving-average-in-python
def uniPredict(x,y,testx,fillval=0):
    """Map each value in ``testx`` to its entry in the ``x`` -> ``y`` table.

    Parameters
    ----------
    x : array-like or pandas.Index
        Known predictor values; must support ``in`` and elementwise ``==``.
    y : array-like
        Values associated with ``x``, indexable by a boolean mask over ``x``.
    testx : iterable
        Predictor values to look up.
    fillval : scalar, optional
        Value used when an element of ``testx`` is not found in ``x``.

    Returns
    -------
    numpy.ndarray
        Flat array holding, for each element of ``testx`` in order, the
        flattened ``y[test == x]`` when the element is in ``x`` and
        ``fillval`` otherwise. Empty ``testx`` gives an empty float array.
    """
    # Collect the pieces and concatenate once; calling np.append inside the
    # loop copies the whole accumulated array on every iteration.
    # The empty float seed reproduces the dtype of the np.array([]) start
    # value and makes the empty-input case work.
    pieces = [np.array([])]
    for test in testx:
        if test in x:
            # np.ravel mirrors np.append's flattening of its arguments.
            pieces.append(np.ravel(y[test == x]))
        else:
            pieces.append(np.ravel(fillval))
    return np.concatenate(pieces)
In [12]:
actualLabel = 'Made_Donation_in_March_2007'
meanRevCount = 3  # pseudo-count handed to meanRevertBins for every predictor

# Value used for predictor values absent from the bin table: the mean of the
# outcome column. It does not depend on the predictor, so compute it once.
fillval = np.mean(df[actualLabel])

# One univariate prediction per predictor, in the order pred1 .. pred4.
# The loop replaces four copy-pasted stanzas and leaves the same module-level
# names (predictorLabel, x, y, testx) bound to the values of the last predictor.
univariatePreds = []
for predictorLabel in ['Months_since_Last_Donation',
                       'Months_since_First_Donation',
                       'Number_of_Donations',
                       'MLDCount']:
    x, y = meanRevertBins(predictorLabel, actualLabel, meanRevCount)
    testx = df[predictorLabel]  # predictions are made for the rows of df itself
    univariatePreds.append(uniPredict(x, y, testx, fillval=fillval))
pred1, pred2, pred3, pred4 = univariatePreds

# Weighted blend of the four predictions; the weights 1, 3, 1, 1 sum to the divisor 6.
multipred = (pred1 + pred2*3 + pred3 + pred4)/6
In [13]:
# Training evaluation: log-loss of each univariate prediction (pred1..pred4)
# and of their blend (multipred) against the outcome column of df.
from sklearn.metrics import log_loss
actual = df[actualLabel]
# print(...) with a single parenthesised argument prints the same text on
# Python 2 and Python 3, whereas the bare print statement is a SyntaxError on 3.
print('Training log-loss score ' + str(log_loss(actual,pred1)))
print('Training log-loss score ' + str(log_loss(actual,pred2)))
print('Training log-loss score ' + str(log_loss(actual,pred3)))
print('Training log-loss score ' + str(log_loss(actual,pred4)))
print('Training log-loss score ' + str(log_loss(actual,multipred)))
In [14]:
# Log-loss of the exact three-way cross-tab estimates against the outcome column of df.
# NOTE(review): this assumes dtTri has one row per entry of `actual`, in the
# same order, and no NaN estimates -- i.e. that `dt` holds the same rows as
# `df`. TODO confirm.
# print(...) with a single argument behaves identically on Python 2 and 3.
print('Training log-loss score ' + str(log_loss(actual,np.array(dtTri['Estimate']))))