In [74]:
import math
import matplotlib.pyplot as plt
import numpy as np
import numpy.linalg as npalg
import pandas as pd
import prettyplotlib as ppl
import random
from scipy.stats import gaussian_kde
import seaborn as sns
import statsmodels.formula.api as sm
from sklearn import linear_model
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
%matplotlib inline
In [87]:
### Load the labelled RAOP (Random Acts of Pizza) training data with topic features
# Load the labelled training data (one row per pizza request).
# pd.read_json is the public API; pd.io.json.read_json is a deprecated
# access path that is removed in modern pandas.
df = pd.read_json('data/TopicTrain.json')
df.head()
Out[87]:
In [88]:
# Split the labelled data by outcome so we can build a stratified
# train/test split below.
pizza = df[df['requester_received_pizza']]
# ~mask is the idiomatic complement of a boolean column; `== False`
# is flagged by linters (E712) and behaves surprisingly with NA values.
nopizza = df[~df['requester_received_pizza']]
In [89]:
# Quick sanity check: how many columns (features + metadata) the frame has.
len(df.columns.values.tolist())
Out[89]:
In [90]:
def error(predictions, truth):
    """Mean squared error between two aligned sequences.

    Parameters
    ----------
    predictions : iterable of numbers
        Predicted scores, consumed pairwise with `truth`.
    truth : sized iterable of numbers
        Ground-truth values; its len() is the divisor.

    Returns
    -------
    float
        sum((p - t)**2) / len(truth).  Raises ZeroDivisionError on an
        empty `truth`, matching the original behaviour.
    """
    # float(len(...)) keeps the division exact under Python 2's
    # integer-division semantics as well as Python 3.
    return sum((p - t) ** 2 for p, t in zip(predictions, truth)) / float(len(truth))
In [91]:
# Hold out TEST_FRACTION of each class separately so the test set keeps
# the same pizza / no-pizza balance as the full data (stratified split).
TEST_FRACTION = 0.2
# list() makes the index sequences shuffleable in place -- range objects
# are immutable in Python 3, so random.shuffle(range(n)) fails there.
pizza_indices = list(range(len(pizza)))
nopizza_indices = list(range(len(nopizza)))
# NOTE(review): no random seed is set, so the split differs between
# runs -- consider random.seed(...) for reproducibility.
random.shuffle(pizza_indices)
random.shuffle(nopizza_indices)
pizza_cutoff = int(len(pizza) * TEST_FRACTION)
nopizza_cutoff = int(len(nopizza) * TEST_FRACTION)
# First `cutoff` shuffled positions go to the test set, the rest to train.
df_train = pd.concat([pizza.iloc[pizza_indices[pizza_cutoff:]],
                      nopizza.iloc[nopizza_indices[nopizza_cutoff:]]])
df_test = pd.concat([pizza.iloc[pizza_indices[:pizza_cutoff]],
                     nopizza.iloc[nopizza_indices[:nopizza_cutoff]]])
y_test = np.array(df_test['requester_received_pizza'])
In [92]:
def build_and_test(model_class):
    # Train `model_class` on the module-level df_train, score it against
    # df_test / y_test (globals built in the split cell above), plot the
    # ROC curve, and return the fitted model instance.
    model = model_class(df_train)
    y = model.predict(df_test)
    print 'Test error:', error(y, y_test)
    # roc_curve accepts continuous scores as well as hard labels.
    fpr, tpr, thresholds = roc_curve(y_test, y)
    roc_auc = auc(fpr, tpr)
    print "Area under the ROC curve : %f" % roc_auc
    # Plot ROC curve
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal for reference
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()
    return model
In [93]:
class Model(object):
    """Template for the train/predict pipeline used in this notebook.

    Subclasses supply extract_features(), train() and predict_internal();
    constructing an instance immediately featurizes the labelled frame
    and fits the model.
    """
    def __init__(self, df_train):
        # .copy() so extract_features may add columns without mutating
        # the caller's frame.
        self.X_train = self.extract_features(df_train.copy())
        self.y_train = np.array(df_train['requester_received_pizza'])
        self.train(self.X_train, self.y_train)
    def extract_features(self, df):
        # Subclass hook: return a feature matrix for `df`.
        raise NotImplementedError
    def train(self, X, y):
        # Subclass hook: fit the underlying estimator on (X, y).
        raise NotImplementedError
    def predict(self, df_test):
        # Featurize df_test and return scores; inputs/outputs are also
        # stashed on the instance for later inspection.
        self.X_test = self.extract_features(df_test.copy())
        self.y_test = self.predict_internal(self.X_test)
        return self.y_test
    def predict_internal(self, X):
        # Subclass hook: score a feature matrix.
        raise NotImplementedError
In [94]:
# List every available column to pick feature names from below.
df.columns.values.tolist()
Out[94]:
In [95]:
def safelog10(value, negval=-1):
    """Base-10 log of `value`, or `negval` when value is not strictly positive."""
    if value > 0:
        return math.log10(value)
    return negval
class NonTopicFeatures(Model):
    """Baseline feature set: log-scaled requester activity plus the
    precomputed text/metadata columns (no topic scores)."""

    def extract_features(self, df):
        # Log-compress the heavy-tailed activity counts.
        for target, source in [
                ('log_age', 'requester_account_age_in_days_at_request'),
                ('log_posts', 'requester_number_of_comments_at_request'),
                ('log_posts_raop', 'requester_number_of_comments_in_raop_at_request')]:
            df[target] = df[source].apply(safelog10)
        columns = ['log_age', 'log_posts', 'log_posts_raop',
                   'giver_user_name_length', 'link_presence?',
                   'log_request_length', 'log_title_length', 'semi_colon?']
        return np.array(df[columns])
class TopicFeatures(Model):
    """Feature set combining the baseline columns with the per-request
    topic scores (the T:* columns)."""

    def extract_features(self, df):
        # Log-compress the heavy-tailed activity counts before modelling.
        df['log_age'] = df['requester_account_age_in_days_at_request'].apply(safelog10)
        df['log_posts'] = df['requester_number_of_comments_at_request'].apply(safelog10)
        df['log_posts_raop'] = df['requester_number_of_comments_in_raop_at_request'].apply(safelog10)
        base = ['log_age', 'log_posts', 'log_posts_raop',
                'giver_user_name_length', 'link_presence?',
                'log_request_length', 'log_title_length', 'semi_colon?']
        topics = ['T:money1', 'T:money2', 'T:Job', 'T:Friend', 'T:Student',
                  'T:Familytime', 'T:Time', 'T:gratitude', 'T:Pizza',
                  'T:General']
        return np.array(df[base + topics])
In [96]:
class LinearRegression(TopicFeatures):
    """Ordinary least-squares regression on the topic feature set."""

    def train(self, X, y):
        self.model = linear_model.LinearRegression()
        self.model.fit(X, y)

    def predict_internal(self, X):
        # OLS scores are unbounded reals; fine for ranking / ROC.
        return self.model.predict(X)

model = build_and_test(LinearRegression)
In [67]:
class LassoRegression(TopicFeatures):
    """L1-regularised linear regression on the topic feature set."""

    def train(self, X, y):
        self.model = linear_model.Lasso(alpha=.1)
        self.model.fit(X, y)

    def predict_internal(self, X):
        return self.model.predict(X)

model = build_and_test(LassoRegression)
In [86]:
class LogisticRegression(TopicFeatures):
    """Logistic regression on the topic feature set."""

    def train(self, X, y):
        clf = linear_model.LogisticRegression()
        clf.fit(X, y)
        self.model = clf

    def predict_internal(self, X):
        # Log-probability of the positive class; monotone in the
        # probability, so the ROC curve and AUC are unaffected.
        return self.model.predict_log_proba(X)[:, 1]

model = build_and_test(LogisticRegression)
In [69]:
class RandomForestc(TopicFeatures):
    """Random-forest classifier on the topic feature set."""

    def train(self, X, y):
        model = RandomForestClassifier(n_estimators=50)
        model.fit(X, y)
        self.model = model

    def predict_internal(self, X):
        # Return the positive-class probability rather than the hard
        # 0/1 label: roc_curve needs a continuous score to trace a
        # real curve (hard labels collapse it to a single point), and
        # this matches the score-valued output of the other models.
        return self.model.predict_proba(X)[:, 1]

model = build_and_test(RandomForestc)
In [70]:
class RandomForest(TopicFeatures):
    """Random-forest regressor on the topic feature set; its averaged
    tree outputs already form a continuous score."""

    def train(self, X, y):
        forest = RandomForestRegressor(n_estimators=30)
        forest.fit(X, y)
        self.model = forest

    def predict_internal(self, X):
        return self.model.predict(X)

model = build_and_test(RandomForest)
In [97]:
# Generate predictions on the competition's test set
df_realtest = pd.io.json.read_json('TopicTest1.json')
print len(df_realtest), 'test samples'
In [98]:
#Training and writing out to csv
model = LinearRegression(df) # Now train on the full labelled dataset
filename = 'Topic_predictions1.csv'
# Pair each request id with its predicted score for the submission file.
results = zip(df_realtest['request_id'],
              model.predict(df_realtest))
f = open(filename, 'w')
# NOTE(review): the space after the comma becomes part of the second
# column name under strict CSV parsing -- confirm the grader accepts it.
print >>f, 'request_id, requester_received_pizza'
for a, b in results:
    print >>f, '%s,%f' % (a, b)
f.close()
In [64]:
# Scratch cell: sanity-check of np.exp broadcasting over a list.
np.exp([2,2])
Out[64]:
In [ ]: