In [14]:
# Import stuff
from __future__ import division

%matplotlib inline

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.interpolate  # explicit: `import scipy` alone does not guarantee the submodule is loaded
import statsmodels.formula.api as smf
from scipy.stats import pearsonr
from sklearn.metrics import log_loss
from statsmodels.nonparametric.smoothers_lowess import lowess
In [15]:
# Build the path to the training CSV: <parent of the current working
# directory>/data/raw/blood_train.csv
bloodPath = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
trainPath = os.path.join(bloodPath, 'data', 'raw', 'blood_train.csv')
# Parenthesised so the cell runs under both Python 2 and Python 3.
print(trainPath)
In [16]:
# Load the training data. DataFrame.from_csv is deprecated (and removed in
# pandas 1.0); read_csv with index_col=0 and parse_dates=True reproduces
# from_csv's defaults.
# Spaces in the column names are replaced with underscores, which is the
# form the later cells (including the OLS formula) refer to.
df = (
    pd.read_csv(trainPath, index_col=0, parse_dates=True)
    .rename(columns=lambda name: name.replace(' ', '_'))
)
In [17]:
# Show the first five rows of the DataFrame
df.head(5)
Out[17]:
In [18]:
# Pearson correlation between the number of donations and the total volume
# donated; the returned tuple is displayed as the cell output.
pearsonr(
    df.loc[:, 'Number_of_Donations'],
    df.loc[:, 'Total_Volume_Donated_(c.c.)'],
)
Out[18]:
In [19]:
# Bar chart of the mean of Made_Donation_in_March_2007 for each distinct
# value of Months_since_Last_Donation.
# NOTE(review): `data` is bound to whatever .plot() returns, not to the
# grouped means themselves.
data = (
    df[['Months_since_Last_Donation', 'Made_Donation_in_March_2007']]
    .groupby(['Months_since_Last_Donation'])
    .mean()
    .plot(kind='bar')
)
In [76]:
# From http://stackoverflow.com/questions/18517722/weighted-moving-average-in-python
def weighted_moving_average(x,y,step_size=0.05,width=1,bin_range =[0, 1]):
    """Gaussian-weighted moving average of ``y`` as a function of ``x``.

    For every bin center, each sample is weighted by a Gaussian of its
    distance in ``x`` from that center, and the weighted mean of ``y`` is
    recorded.

    Parameters
    ----------
    x, y : array-like
        Sample positions and sample values (same length).
    step_size : number
        Spacing between consecutive bin centers.
    width : number
        Standard deviation (sigma) of the Gaussian weighting kernel.
    bin_range : sequence of two numbers
        ``[start, stop]``; bin centers are ``np.arange(start, stop, step_size)``,
        so ``stop`` itself is excluded. The default list is never mutated.

    Returns
    -------
    (bin_centers, bin_avg) : tuple of numpy arrays of equal length.

    Raises
    ------
    ZeroDivisionError
        Raised by ``np.average`` if all weights for a bin are zero.
    """
    # Work in floating point so the exponent below never uses integer
    # division, even for integer inputs on Python 2 without
    # `from __future__ import division`.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # step_size was previously ignored (np.arange was called without a step).
    bin_centers = np.arange(bin_range[0], bin_range[1], step_size)
    bin_avg = np.zeros(len(bin_centers))

    # We're going to weight with a Gaussian function
    def gaussian(x, amp=1, mean=0, sigma=1):
        return amp * np.exp(-(x - mean) ** 2 / (2.0 * sigma ** 2))

    for index, bin_center in enumerate(bin_centers):
        weights = gaussian(x, mean=bin_center, sigma=width)
        bin_avg[index] = np.average(y, weights=weights)

    return (bin_centers, bin_avg)
def _smooth_plot_interpolate(x, y):
    """Smooth ``y`` over ``x``, draw the curve, and return it with an interpolator.

    Calls ``weighted_moving_average`` with step_size=1, width=5 and
    bin_range=[0, 100], adds the smoothed curve to the current matplotlib
    plot, and wraps it in ``scipy.interpolate.interp1d``.

    Returns (bin_centers, bin_averages, interpolator).
    """
    centers, averages = weighted_moving_average(
        x, y, step_size=1, width=5, bin_range=[0, 100])
    plt.plot(centers, averages)
    return centers, averages, scipy.interpolate.interp1d(centers, averages)


# The three sections below smooth the response against one feature each and
# draw all three curves on the same plot. `df` is re-sorted before each
# section, so after this cell it is ordered by Months_since_First_Donation.
# NOTE(review): the sort does not look necessary for the weighted average
# itself; it is kept so the row order of `df` (and of `x`/`y`) is unchanged
# for later cells.
# NOTE(review): interp1d with default settings rejects inputs outside the
# bin-center range it was built on -- confirm all feature values fall
# inside it.

# --- Months since last donation ---
df = df.sort_values('Months_since_Last_Donation')
x = df['Months_since_Last_Donation'].values
y = df['Made_Donation_in_March_2007'].values
x_outLD, y_outLD, fLD = _smooth_plot_interpolate(x, y)

# --- Number of donations ---
df = df.sort_values('Number_of_Donations')
x = df['Number_of_Donations'].values
y = df['Made_Donation_in_March_2007'].values
x_outND, y_outND, fND = _smooth_plot_interpolate(x, y)
print(x_outND)

# --- Months since first donation ---
df = df.sort_values('Months_since_First_Donation')
x = df['Months_since_First_Donation'].values
y = df['Made_Donation_in_March_2007'].values
x_outFD, y_outFD, fFD = _smooth_plot_interpolate(x, y)
print(x_outFD)
In [54]:
# Show the feature values most recently stored in `x` (in file order, the
# Months_since_First_Donation values from the smoothing cell).
# Parenthesised so the cell runs under both Python 2 and Python 3.
print(x)
Since Total Volume Donated adds no information beyond Number of Donations (see the Pearson correlation computed above), I won't include it as a feature.
In [55]:
# Drop the unused volume column. DataFrame.drop returns a new frame, so the
# result must be assigned back (previously it was discarded and the column
# stayed in `df`). errors='ignore' keeps the cell re-runnable once the
# column is gone.
df = df.drop('Total_Volume_Donated_(c.c.)', axis=1, errors='ignore')

# Ordinary least squares fit of the response on the three remaining features.
model = smf.ols('Made_Donation_in_March_2007 ~ Months_since_Last_Donation + Number_of_Donations + Months_since_First_Donation', data = df)
result = model.fit()

# Only the last expression of a cell is displayed, so the summary has to be
# printed explicitly or it is silently thrown away.
print(result.summary())
result.fittedvalues
Out[55]:
In [56]:
# List the column labels currently present in the DataFrame
df.columns
Out[56]:
In [57]:
# Constant baseline: every row gets the overall mean of the response.
mean = df['Made_Donation_in_March_2007'].mean()
# Assigning the scalar broadcasts it to every row, whatever the row count
# (previously hard-coded as a 576x1 array of ones times the mean).
df['Means'] = mean
# Parenthesised so the cell runs under both Python 2 and Python 3.
print(df['Means'])
In [119]:
# Hand-weighted blend of the three smoothed per-feature curves and the
# constant baseline. This rebinds `model`, which the OLS cell also assigns.
# The terms multiplied by 0 are still evaluated; they just contribute
# nothing to the sum.
# NOTE(review): the non-zero weights sum to 2 but the total is divided by
# 3 -- confirm this scaling is intentional.
model = (
    fLD(df.Months_since_Last_Donation) * 1
    + fFD(df.Months_since_First_Donation) * 0
    + fND(df.Number_of_Donations) * 1
    + df.Means * 0
) / 3
In [120]:
# Training evaluation of the constant baseline (the df['Means'] column)
pred = np.array(df.Means)
actual = df['Made_Donation_in_March_2007']
# Parenthesised so the cell runs under both Python 2 and Python 3.
print('Training log-loss score ' + str(log_loss(actual, pred)))
In [121]:
# Training evaluation of the blended predictions stored in `model`
pred = np.array(model)
actual = df['Made_Donation_in_March_2007']
# Parenthesised so the cell runs under both Python 2 and Python 3.
print('Training log-loss score ' + str(log_loss(actual, pred)))
In [ ]: