In [80]:
# Import stuff
%matplotlib inline
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import scipy
from scipy.stats import pearsonr
from __future__ import division
import matplotlib.pyplot as plt
from statsmodels.nonparametric.smoothers_lowess import lowess
In [81]:
%%bash
# Print the version of whichever `python` executable this shell resolves.
# NOTE(review): that may not be the interpreter the notebook kernel itself runs on.
python --version
In [82]:
import os

# The project root is taken to be the parent of the current working directory.
bloodPath = os.path.abspath(os.path.join(os.getcwd(), '..'))
trainPath = os.path.join(bloodPath, 'data', 'raw', 'blood_train.csv')
# NOTE(review): testPath points at the *training* CSV, so `dt` holds the same
# rows as `df`. The later "Training log-loss" cells compare predictions made on
# `dt` with labels taken from `df`, so they only work while this is the case.
# Point this at the real test file before producing a submission.
testPath = os.path.join(bloodPath, 'data', 'raw', 'blood_train.csv')


def _loadBlood(csvPath):
    """Read one blood-donation CSV and normalise its column names.

    The first CSV column becomes the index. `pd.read_csv(..., index_col=0,
    parse_dates=True)` is the documented replacement for the removed
    `pd.DataFrame.from_csv(path)`.

    Parameters
    ----------
    csvPath : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with spaces in column names replaced by underscores.
    """
    frame = pd.read_csv(csvPath, index_col=0, parse_dates=True)
    frame.columns = [c.replace(' ', '_') for c in frame.columns]
    return frame


df = _loadBlood(trainPath)
dt = _loadBlood(testPath)
In [83]:
# Preview the first five rows of the training frame.
df.head(5)
Out[83]:
In [84]:
# Preview the first five rows of the frame loaded from testPath.
dt.head(5)
Out[84]:
In [85]:
# Pearson correlation between the number of donations and the total volume donated.
donationCount = df['Number_of_Donations']
donatedVolume = df['Total_Volume_Donated_(c.c.)']
pearsonr(donationCount, donatedVolume)
Out[85]:
In [86]:
# Group the target by "months since last donation".
lastDonationCols = ['Made_Donation_in_March_2007', 'Months_since_Last_Donation']
byLastDonation = df[lastDonationCols].groupby('Months_since_Last_Donation')

# MLDCount: for every row, how many training rows share its
# Months_since_Last_Donation value. This adds a column to df in place.
df['MLDCount'] = byLastDonation.transform(lambda grp: grp.count())

# Mean target per group size.
# NOTE(review): this value is computed but never shown, because only the last
# expression of a cell is displayed.
df[['Made_Donation_in_March_2007', 'MLDCount']].groupby('MLDCount').mean()

# Displayed result: number of rows per Months_since_Last_Donation value.
byLastDonation.count()
Out[86]:
In [91]:
# The three donation-history predictor columns.
# NOTE(review): no later line visible here reads this global; getCrossTabExact
# has a parameter with the same name that shadows it, and the calls below spell
# the column lists out explicitly.
predLabels = ['Months_since_Last_Donation','Months_since_First_Donation','Number_of_Donations']
def getCrossTabExact(predLabels, train=None, test=None):
    """Look up, for every test row, the training donation rate of its exact group.

    Training rows are grouped by the columns in ``predLabels``. Each test row
    is matched to the group with identical values in those columns and gets
    that group's mean target (``Estimate``) and a ``Weight`` equal to the
    group's row count multiplied by ``len(predLabels)``, so estimates built
    from more predictors weigh more.

    Parameters
    ----------
    predLabels : list of str
        Column names to group and match on.
    train : pandas.DataFrame, optional
        Frame containing ``predLabels`` and 'Made_Donation_in_March_2007'.
        Defaults to the notebook-level ``df``.
    test : pandas.DataFrame, optional
        Frame containing ``predLabels``. Defaults to the notebook-level ``dt``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``Estimate`` and ``Weight``, one row per test row, indexed by
        the test frame's index (index name 'Index'). Test rows whose predictor
        combination never occurs in training get 0 in both columns, so they
        contribute nothing to a weighted blend.
    """
    target = 'Made_Donation_in_March_2007'
    train = df if train is None else train
    test = dt if test is None else test
    labels = list(predLabels)

    # Aggregate only the target column: group mean -> Estimate, group size -> Weight.
    stats = train.groupby(labels)[target].agg(['mean', 'count']).reset_index()
    stats = stats.rename(columns={'mean': 'Estimate', 'count': 'Weight'})
    stats['Weight'] = stats['Weight'] * len(labels)

    # Work on a copy of the key columns so the caller's frame is never mutated.
    # The index is carried through the merge as a column because merge resets it.
    keys = test[labels].copy()
    keys['Index'] = keys.index
    merged = pd.merge(keys, stats, how='left', on=labels).set_index('Index')

    # fillna returns a new frame, so its result must be the value returned.
    return merged[['Estimate', 'Weight']].fillna(0)
# One cross-tab frame per predictor combination: the full triple, the three
# pairs, then each predictor on its own.
_msld = 'Months_since_Last_Donation'
_msfd = 'Months_since_First_Donation'
_nd = 'Number_of_Donations'
(dtTri, dtPair1, dtPair2, dtPair3, dtMSLD, dtMSFD, dtND) = [
    getCrossTabExact(labels)
    for labels in (
        [_msld, _msfd, _nd],
        [_msld, _msfd],
        [_msld, _nd],
        [_msfd, _nd],
        [_msld],
        [_msfd],
        [_nd],
    )
]
In [124]:
def getCrossTabPred(crossTabs, meanVal, meanRevertWeight):
    """Blend several cross-tab estimates into one weighted-average prediction.

    For each row the result is::

        (sum_i Estimate_i * Weight_i + meanVal * meanRevertWeight)
        / (sum_i Weight_i + meanRevertWeight)

    so ``meanRevertWeight`` acts as a pseudo-count pulling rows with little
    supporting evidence toward ``meanVal``.

    Parameters
    ----------
    crossTabs : list of pandas.DataFrame
        Frames with ``Estimate`` and ``Weight`` columns, all sharing the same
        index (as returned by getCrossTabExact).
    meanVal : float
        Prior estimate to shrink toward.
    meanRevertWeight : float
        Weight given to ``meanVal``.

    Returns
    -------
    pandas.Series
        Blended prediction per row, indexed like the input frames.

    Raises
    ------
    ValueError
        If ``crossTabs`` is empty.
    """
    if len(crossTabs) == 0:
        raise ValueError('crossTabs must contain at least one cross-tab frame')

    # The original body read `crosstabs` (lower-case t), which is not the
    # parameter name and fails with NameError on a fresh kernel.
    nRows = len(crossTabs[0])
    wEst = np.zeros(nRows)
    weights = np.zeros(nRows)
    for ct in crossTabs:
        # A missing weight or estimate means "no evidence"; treat it as zero
        # so one unmatched cross-tab cannot turn the whole row into NaN.
        ctWeight = ct.Weight.fillna(0)
        wEst = (ct.Estimate * ctWeight).fillna(0) + wEst
        weights = ctWeight + weights
    wEst = meanVal * meanRevertWeight + wEst
    weights = weights + meanRevertWeight
    pred = wEst / weights
    # With no evidence and no prior weight the ratio is 0/0; fall back to the prior.
    return pred.where(weights > 0, meanVal)
In [125]:
# Blend all seven cross-tab frames. meanRevertWeight is the weight that
# getCrossTabPred gives to the overall training donation rate (meanVal).
meanRevertWeight = 10
meanVal = df['Made_Donation_in_March_2007'].mean()
crossTabs = [dtTri, dtPair1, dtPair2, dtPair3, dtMSLD, dtMSFD, dtND]
pred = getCrossTabPred(
    crossTabs=crossTabs,
    meanVal=meanVal,
    meanRevertWeight=meanRevertWeight,
)
In [126]:
# Display the blended predictions returned by getCrossTabPred.
pred
Out[126]:
In [127]:
# Training evaluation
from sklearn.metrics import log_loss

# `actualLabel` was not defined anywhere in the notebook, so this cell only ran
# on leftover kernel state. Define it here so the cell works on a fresh kernel.
actualLabel = 'Made_Donation_in_March_2007'
actual = df[actualLabel]

# NOTE(review): `pred` was computed for the rows of `dt` and is compared
# positionally with labels from `df`. That is only meaningful while `dt` is
# loaded from the same file as `df`.
print('Training log-loss score ' + str(log_loss(actual, np.array(pred))))
In [129]:
# Raw matrix and target list.
# `DataFrame.as_matrix()` was deprecated and later removed from pandas;
# `.values` returns the same ndarray and exists in every pandas version.
# NOTE(review): X is built from every column of df, which includes the target
# column itself. Drop it before using X as model input.
X = df.values
y = list(df["Made_Donation_in_March_2007"])
In [132]:
# Display the full target list (one entry per row of df).
y
Out[132]:
In [77]:
# Keep the blended predictions as finalPred and store them on dt as column 'Final'.
finalPred = dt['Final'] = pred
In [53]:
# Log-loss of the final predictions against the labels held in `actual`.
finalLogLoss = log_loss(actual, finalPred)
print('Training log-loss score ' + str(finalLogLoss))
In [79]:
# Write the predictions, keyed by dt's index, to try.csv.
# header=False is passed explicitly: it was the default for Series.to_csv in
# the pandas versions that still ship DataFrame.from_csv, but the default
# changed later, so pinning it keeps the file layout stable across versions.
dt['Final'].to_csv('try.csv', header=False)
In [ ]: