In [327]:
import cvxopt
import math
import matplotlib.pyplot as plt
import numpy as np
import numpy.linalg as npalg
import pandas as pd
import prettyplotlib as ppl
import random
from scipy.stats import gaussian_kde
import seaborn as sns
import statsmodels.formula.api as sm
from sklearn import svm, linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_curve, auc
%matplotlib inline

In [308]:
# Load the labelled training set and partition it by outcome.
df = pd.read_json('data/train.json')  # public API; pd.io.json.read_json is a deprecated private path
pizza = df[df['requester_received_pizza']]
nopizza = df[df['requester_received_pizza'] == False]  # boolean column; complement of the mask above

In [56]:
# Random 80/20 train/test split, done separately per class so both sets
# keep the same pizza/no-pizza ratio (a stratified split).
TEST_FRACTION = 0.2
# list() so shuffle works under Python 3 too, where range() is a lazy
# sequence that random.shuffle cannot mutate (Python 2 returns a list).
pizza_indices = list(range(len(pizza)))
nopizza_indices = list(range(len(nopizza)))
# NOTE(review): no random.seed() call, so the split differs on every
# run and results are not reproducible -- confirm this is intentional.
random.shuffle(pizza_indices)
random.shuffle(nopizza_indices)
pizza_cutoff = int(len(pizza) * TEST_FRACTION)
nopizza_cutoff = int(len(nopizza) * TEST_FRACTION)
df_train = pd.concat([pizza.iloc[pizza_indices[pizza_cutoff:]], nopizza.iloc[nopizza_indices[nopizza_cutoff:]]])
df_test = pd.concat([pizza.iloc[pizza_indices[:pizza_cutoff]], nopizza.iloc[nopizza_indices[:nopizza_cutoff]]])
y_test = np.array(df_test['requester_received_pizza'])

In [57]:
# Sanity check on the split sizes (80/20 of 4040 rows, 32 columns each).
df_train.shape, df_test.shape


Out[57]:
((3233, 32), (807, 32))

In [79]:
def error(predictions, truth):
    """Mean squared error (Brier score) of probabilistic predictions
    against 0/1 labels.

    Raises ValueError on mismatched lengths: the original zip() loop
    silently truncated to the shorter sequence while still dividing by
    len(truth), which produced a silently wrong score.
    """
    predictions = np.asarray(predictions, dtype=float)
    truth = np.asarray(truth, dtype=float)
    if predictions.shape[0] != truth.shape[0]:
        raise ValueError('predictions and truth differ in length: %d vs %d'
                         % (predictions.shape[0], truth.shape[0]))
    # Vectorized in place of the Python-level accumulation loop.
    return float(((predictions - truth) ** 2).sum() / len(truth))

In [153]:
class Model(object):
    """Minimal train/predict template for the pizza-prediction models.

    Subclasses provide extract_features, train, and predict_internal;
    the boolean column 'requester_received_pizza' is the target.
    """

    def __init__(self, df_train):
        # Work on a copy so feature extraction can add columns without
        # mutating the caller's frame.
        self.X_train = self.extract_features(df_train.copy())
        self.y_train = np.array(df_train['requester_received_pizza'])
        self.train(self.X_train, self.y_train)

    def extract_features(self, df):
        """Turn a raw frame into a feature matrix; must be overridden."""
        raise NotImplementedError

    def train(self, X, y):
        """Fit on features X and labels y; must be overridden."""
        raise NotImplementedError

    def predict_internal(self, X):
        """Score a feature matrix; must be overridden."""
        raise NotImplementedError

    def predict(self, df_test):
        # Keep the last test features/scores around for inspection.
        self.X_test = self.extract_features(df_test.copy())
        self.y_test = self.predict_internal(self.X_test)
        return self.y_test

In [154]:
class DummyNoPizza(Model):
    """Baseline: always predicts probability 0.0 (never gets pizza)."""
    def extract_features(self, df):
        # One constant column; the features are never actually used.
        return np.ones((len(df), 1))
    def train(self, X, y):
        pass  # nothing to fit
    def predict_internal(self, X):
        return np.zeros(X.shape[0])

class DummyYesPizza(Model):
    """Baseline: always predicts probability 1.0 (always gets pizza)."""
    def extract_features(self, df):
        # One constant column; the features are never actually used.
        return np.ones((len(df), 1))
    def train(self, X, y):
        pass  # nothing to fit
    def predict_internal(self, X):
        return np.ones(X.shape[0])

class DummyTrueOddsPizza(Model):
    """Baseline: always predicts the training-set base rate of success."""
    def extract_features(self, df):
        return np.ones((len(df), 1))
    def train(self, X, y):
        # Fraction of positive labels seen during training.
        self.odds = np.mean(y)
    def predict_internal(self, X):
        return np.full(X.shape[0], self.odds)

In [191]:
# Evaluate the three constant baselines with the MSE (Brier-score)
# `error` metric defined above.
model = DummyNoPizza(df_train)
print 'Dummy "Always predict NO pizza (0.0)" model error:', error(model.predict(df_test), y_test)
model = DummyYesPizza(df_train)
print 'Dummy "Always predict YES pizza (1.0)" model error:', error(model.predict(df_test), y_test)
model = DummyTrueOddsPizza(df_train)
# Predicting the base rate minimizes MSE among constant predictors.
print 'Dummy "Always predict avg pizza odds (%.3f)" model error:' % df_train['requester_received_pizza'].mean(), error(model.predict(df_test), y_test)


Dummy "Always predict NO pizza (0.0)" model error: 0.245353159851
Dummy "Always predict YES pizza (1.0)" model error: 0.754646840149
Dummy "Always predict avg pizza odds (0.246)" model error: 0.185155722606

In [330]:
def build_and_test(model_class):
    """Train model_class on df_train, report test MSE and ROC AUC on
    df_test, plot the ROC curve, and return the fitted model.

    Relies on the module-level df_train, df_test and y_test variables.
    """
    model = model_class(df_train)
    y = model.predict(df_test)
    print 'Test error:', error(y, y_test)
    # roc_curve accepts continuous scores, not just hard 0/1 labels.
    fpr, tpr, thresholds = roc_curve(y_test, y)
    roc_auc = auc(fpr, tpr)
    print "Area under the ROC curve : %f" % roc_auc
    # Plot ROC curve
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal (AUC = 0.5)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()
    return model

In [334]:
# Constant predictor: AUC is exactly 0.5 since it cannot rank samples.
build_and_test(DummyTrueOddsPizza)


Test error: 0.185155722606
Area under the ROC curve : 0.500000
Out[334]:
<__main__.DummyTrueOddsPizza at 0x1158516d0>

In [202]:
def safelog10(value, negval=-1):
    """log10 that maps non-positive inputs to the sentinel `negval`
    instead of raising a math domain error."""
    return math.log10(value) if value > 0 else negval

class SimpleRealFeatures(Model):
    """Four log-scaled numeric features: request length, account age,
    total comment count, and RAOP comment count (all at request time)."""
    def extract_features(self, df):
        log_columns = [
            ('log_age', 'requester_account_age_in_days_at_request'),
            ('log_posts', 'requester_number_of_comments_at_request'),
            ('log_posts_raop', 'requester_number_of_comments_in_raop_at_request'),
        ]
        # Text length gets its own lambda since the source is a string column.
        df['log_textlen'] = df['request_text_edit_aware'].apply(lambda text: safelog10(len(text)))
        for feature, source in log_columns:
            df[feature] = df[source].apply(safelog10)
        return np.array(df[['log_textlen', 'log_age', 'log_posts', 'log_posts_raop']])

class SimpleBinaryFeatures(Model):
    """Two boolean features: brand-new account and zero-comment account."""
    def extract_features(self, df):
        age = df['requester_account_age_in_days_at_request']
        comments = df['requester_number_of_comments_at_request']
        df['new_user'] = age == 0
        df['inactive_user'] = comments == 0
        return np.array(df[['new_user', 'inactive_user']])

In [ ]:


In [331]:
class LinearRegression(SimpleRealFeatures):
    """Ordinary least squares on the simple real-valued features."""
    def train(self, X, y):
        # sklearn fit() returns the estimator, so train and store in one step.
        self.model = linear_model.LinearRegression().fit(X, y)
    def predict_internal(self, X):
        return self.model.predict(X)

# NOTE(review): raw linear-regression output is unbounded, not a calibrated probability.
model = build_and_test(LinearRegression)


Test error: 0.178817999271
Area under the ROC curve : 0.628560

In [332]:
class RandomForest(SimpleRealFeatures):
    """Random-forest regressor (100 trees) on the simple real features.

    No random_state is fixed, so refitting gives slightly different trees.
    """
    def train(self, X, y):
        self.model = RandomForestRegressor(n_estimators=100)
        self.model.fit(X, y)
    def predict_internal(self, X):
        return self.model.predict(X)

# Underperforms the linear model here (higher MSE, lower AUC in the run above).
model = build_and_test(RandomForest)


Test error: 0.205871167029
Area under the ROC curve : 0.587273

In [333]:
class LogisticRegression(SimpleRealFeatures):
    """Logistic regression on the simple real features; outputs P(pizza)."""
    def train(self, X, y):
        # sklearn fit() returns the estimator, so train and store in one step.
        self.model = linear_model.LogisticRegression().fit(X, y)
    def predict_internal(self, X):
        # Column 1 of predict_proba is the positive-class probability.
        return self.model.predict_proba(X)[:, 1]

# Best of the simple models in the runs above (lowest MSE, AUC ~0.635).
model = build_and_test(LogisticRegression)


Test error: 0.177500163067
Area under the ROC curve : 0.635053

In [335]:
class NewUserModel(SimpleBinaryFeatures):
    """Hand-tuned rule: predict 0.16 for brand-new accounts, 0.27 otherwise."""
    def train(self, X, y):
        pass  # the two constants below are set by hand; nothing to fit
    def predict_internal(self, X):
        # Column 0 is the new_user flag from SimpleBinaryFeatures.
        return np.where(X[:, 0], .16, .27)

# A single binary flag already beats the constant baseline (AUC ~0.55).
model = build_and_test(NewUserModel)


Test error: 0.18340842627
Area under the ROC curve : 0.548092

In [326]:
# Generate predictions on the competition's test set
# (separate file from train.json; used only for the submission below).
df_realtest = pd.io.json.read_json('data/test.json')
print len(df_realtest), 'test samples'


1631 test samples

In [325]:
# Retrain on all labelled data and write a submission CSV of
# (request_id, predicted probability) rows.
model = LogisticRegression(df)  # Now train on the full labelled dataset
filename = 'simple_predictions.csv'
results = zip(df_realtest['request_id'], model.predict(df_realtest))
f = open(filename, 'w')  # NOTE(review): prefer a `with` block so the handle closes on error
print >>f, 'request_id,requester_received_pizza'
for a, b in results:
    print >>f, '%s,%f' % (a, b)
f.close()

In [ ]: