In [74]:
import math
import matplotlib.pyplot as plt
import numpy as np
import numpy.linalg as npalg
import pandas as pd
import prettyplotlib as ppl
import random
from scipy.stats import gaussian_kde
import seaborn as sns
import statsmodels.formula.api as sm
from sklearn import linear_model
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
%matplotlib inline

In [87]:
###
df = pd.io.json.read_json('data/TopicTrain.json')
df.head()


Out[87]:
T:Familytime T:Friend T:General T:Job T:Pizza T:Student T:Time T:gratitude T:money1 T:money2 ... requester_subreddits_at_request requester_upvotes_minus_downvotes_at_request requester_upvotes_minus_downvotes_at_retrieval requester_upvotes_plus_downvotes_at_request requester_upvotes_plus_downvotes_at_retrieval requester_user_flair requester_username semi_colon? unix_timestamp_of_request unix_timestamp_of_request_utc
0 2 2 1 0 0 0 2 1 0 2 ... [] 0 1 0 1 None nickylvst 0 1317852607 1317849007
1 1 0 0 1 1 1 2 0 1 2 ... [AskReddit, Eve, IAmA, MontereyBay, RandomKind... 34 4258 116 11168 None fohacidal 0 1332652424 1332648824
10 2 2 3 2 2 1 5 1 2 1 ... [4chan, AskReddit, IAmA, ImGoingToHellForThis,... 1738 18617 2634 29755 shroom seabass86 0 1358559286 1358559286
100 1 1 1 1 1 1 2 1 2 1 ... [] 0 31 0 41 None amym91 0 1333479224 1333475624
1000 0 2 2 1 2 1 1 1 1 1 ... [FoodstuffsAllAround, IAmA, RandomActsOfCookie... 87 97 183 223 None CharlieWhoop 0 1379171423 1379167823

5 rows × 50 columns


In [88]:
pizza = df[df['requester_received_pizza']]
nopizza = df[df['requester_received_pizza'] == False]

In [89]:
len(df.columns.values.tolist())


Out[89]:
50

In [90]:
def error(predictions, truth):
    mse = 0.0
    for x, y in zip(predictions, truth):
        mse += (x - y) ** 2
    return mse / len(truth)

In [91]:
TEST_FRACTION = 0.2
pizza_indices = range(len(pizza))
nopizza_indices = range(len(nopizza))
random.shuffle(pizza_indices)
random.shuffle(nopizza_indices)
pizza_cutoff = int(len(pizza) * TEST_FRACTION)
nopizza_cutoff = int(len(nopizza) * TEST_FRACTION)
df_train = pd.concat([pizza.iloc[pizza_indices[pizza_cutoff:]], nopizza.iloc[nopizza_indices[nopizza_cutoff:]]])
df_test = pd.concat([pizza.iloc[pizza_indices[:pizza_cutoff]], nopizza.iloc[nopizza_indices[:nopizza_cutoff]]])
y_test = np.array(df_test['requester_received_pizza'])

In [92]:
def build_and_test(model_class):
    model = model_class(df_train)
    y = model.predict(df_test)
    print 'Test error:', error(y, y_test)
    fpr, tpr, thresholds = roc_curve(y_test, y)
    roc_auc = auc(fpr, tpr)
    print "Area under the ROC curve : %f" % roc_auc
    # Plot ROC curve
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")
    plt.show()
    return model

In [93]:
class Model(object):
    def __init__(self, df_train):
        self.X_train = self.extract_features(df_train.copy())
        self.y_train = np.array(df_train['requester_received_pizza'])
        self.train(self.X_train, self.y_train)
    def extract_features(self, df):
        raise NotImplementedError
    def train(self, X, y):
        raise NotImplementedError
    def predict(self, df_test):
        self.X_test = self.extract_features(df_test.copy())
        self.y_test = self.predict_internal(self.X_test)
        return self.y_test
    def predict_internal(self, X):
        raise NotImplementedError

In [94]:
df.columns.values.tolist()


Out[94]:
[u'T:Familytime',
 u'T:Friend',
 u'T:General',
 u'T:Job',
 u'T:Pizza',
 u'T:Student',
 u'T:Time',
 u'T:gratitude',
 u'T:money1',
 u'T:money2',
 u'clean_request_text',
 u'clean_title_text',
 u'giver_user_name_length',
 u'giver_username_if_known',
 u'link_presence?',
 u'log_request_length',
 u'log_title_length',
 u'number_of_downvotes_of_request_at_retrieval',
 u'number_of_upvotes_of_request_at_retrieval',
 u'post_was_edited',
 u'request_id',
 u'request_number_of_comments_at_retrieval',
 u'request_text',
 u'request_text_edit_aware',
 u'request_title',
 u'request_user_name_length',
 u'requester_account_age_in_days_at_request',
 u'requester_account_age_in_days_at_retrieval',
 u'requester_days_since_first_post_on_raop_at_request',
 u'requester_days_since_first_post_on_raop_at_retrieval',
 u'requester_number_of_comments_at_request',
 u'requester_number_of_comments_at_retrieval',
 u'requester_number_of_comments_in_raop_at_request',
 u'requester_number_of_comments_in_raop_at_retrieval',
 u'requester_number_of_posts_at_request',
 u'requester_number_of_posts_at_retrieval',
 u'requester_number_of_posts_on_raop_at_request',
 u'requester_number_of_posts_on_raop_at_retrieval',
 u'requester_number_of_subreddits_at_request',
 u'requester_received_pizza',
 u'requester_subreddits_at_request',
 u'requester_upvotes_minus_downvotes_at_request',
 u'requester_upvotes_minus_downvotes_at_retrieval',
 u'requester_upvotes_plus_downvotes_at_request',
 u'requester_upvotes_plus_downvotes_at_retrieval',
 u'requester_user_flair',
 u'requester_username',
 u'semi_colon?',
 u'unix_timestamp_of_request',
 u'unix_timestamp_of_request_utc']

In [95]:
def safelog10(value, negval=-1):
    if value <= 0:
        return negval
    return math.log10(value)

class NonTopicFeatures(Model):
    def extract_features(self, df):
        #df['log_textlen'] = df['request_text_edit_aware'].apply(lambda x: safelog10(len(x)))
        df['log_age'] = df['requester_account_age_in_days_at_request'].apply(safelog10)
        df['log_posts'] = df['requester_number_of_comments_at_request'].apply(safelog10)
        df['log_posts_raop'] = df['requester_number_of_comments_in_raop_at_request'].apply(safelog10)
        return np.array(df[
                           ['log_age', 'log_posts', 'log_posts_raop', 'giver_user_name_length','link_presence?','log_request_length','log_title_length','semi_colon?']])

class TopicFeatures(Model):
    def extract_features(self, df):
        #df['log_textlen'] = df['request_text_edit_aware'].apply(lambda x: safelog10(len(x)))
        df['log_age'] = df['requester_account_age_in_days_at_request'].apply(safelog10)
        df['log_posts'] = df['requester_number_of_comments_at_request'].apply(safelog10)
        df['log_posts_raop'] = df['requester_number_of_comments_in_raop_at_request'].apply(safelog10)
        return np.array(df[
                           ['log_age', 'log_posts', 'log_posts_raop', 'giver_user_name_length','link_presence?',
                            'log_request_length','log_title_length','semi_colon?','T:money1','T:money2','T:Job','T:Friend',
                            'T:Student','T:Familytime','T:Time','T:gratitude','T:Pizza','T:General'
                            
                            ]])

In [96]:
class LinearRegression(TopicFeatures):
    def train(self, X, y):
        model = linear_model.LinearRegression()
        model.fit(X, y)
        self.model = model
    def predict_internal(self, X):
        return self.model.predict(X)
model = build_and_test(LinearRegression)


Test error: 0.147042429789
Area under the ROC curve : 0.738319

In [67]:
class LassoRegression(TopicFeatures):
    def train(self, X, y):
        model = linear_model.Lasso( alpha=.1)
        model.fit(X, y)
        self.model = model
    def predict_internal(self, X):
        return self.model.predict(X)
model = build_and_test(LassoRegression)


Test error: 0.152529716126
Area under the ROC curve : 0.643939

In [86]:
class LogisticRegression(TopicFeatures):
    def train(self, X, y):
        model = linear_model.LogisticRegression()
        model.fit(X, y)
        self.model = model
    def predict_internal(self, X):
        return self.model.predict_log_proba(X)[:,1]

model = build_and_test(LogisticRegression)


Test error: 3.79633717479
Area under the ROC curve : 0.786411

In [69]:
class RandomForestc(TopicFeatures):
    def train(self, X, y):
        model = RandomForestClassifier(n_estimators=50)
        model.fit(X, y)
        self.model = model
    def predict_internal(self, X):
        return self.model.predict(X)

model = build_and_test(RandomForestc)


Test error: 0.173482032218
Area under the ROC curve : 0.654986

In [70]:
class RandomForest(TopicFeatures):
    def train(self, X, y):
        model = RandomForestRegressor(n_estimators=30)
        model.fit(X, y)
        self.model = model
    def predict_internal(self, X):
        return self.model.predict(X)

model = build_and_test(RandomForest)


Test error: 0.142010188627
Area under the ROC curve : 0.719929

In [97]:
# Generate predictions on the competition's test set
df_realtest = pd.io.json.read_json('TopicTest1.json')
print len(df_realtest), 'test samples'


1631 test samples

In [98]:
#Training and writing out to csv
model = LinearRegression(df)  # Now train on the full labelled dataset
filename = 'Topic_predictions1.csv'
results = zip(df_realtest['request_id'],
              model.predict(df_realtest))
f = open(filename, 'w')
print >>f, 'request_id, requester_received_pizza'
for a, b in results:
    print >>f, '%s,%f' % (a, b)
f.close()

In [64]:
np.exp([2,2])


Out[64]:
array([ 7.3890561,  7.3890561])

In [ ]: