In [12]:
from sklearn.grid_search import GridSearchCV  # pre-0.18 scikit-learn location; later releases use sklearn.model_selection
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
import re
from datetime import datetime
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

In [13]:
def clean(s):
    # Keep only word characters and lower-case the result; non-string values
    # (e.g. NaN essays) fall back to a placeholder token.
    try:
        return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()
    except TypeError:
        return "no_text"

start = datetime.now()
#donations = pd.read_csv('donations.csv')
projects = pd.read_csv('../data/projects.csv')
outcomes = pd.read_csv('../data/outcomes.csv')
#resources = pd.read_csv('resources.csv')
sample = pd.read_csv('../data/sampleSubmission.csv')
essays = pd.read_csv('../data/essays.csv')


# .sort(column) is the older pandas spelling of sort_values(); order the
# frames by projectid so rows line up across files.
ess_proj = essays.sort('projectid')
projects = projects.sort('projectid')
sample = sample.sort('projectid')
#ess_proj = pd.merge(essays, projects, on='projectid')
outcomes = outcomes.sort('projectid')

In [14]:
# Keep only projects that have both an outcome and an essay (inner joins),
# split by posting date, and pull out the label column.
projects = projects.merge(outcomes, how='inner')
projects = projects.merge(essays, how='inner')
del essays
dates = np.array(projects.date_posted)
train_idx = np.where(dates < '2014-01-01')[0]
test_idx = np.where(dates >= '2014-01-01')[0]
outcomes = np.array(projects.is_exciting)
projects['essay'] = projects['essay'].apply(clean)

In [15]:
projects = np.array(projects.essay)
train = projects[train_idx]
del projects
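# Note: the grid search below is fit on train = projects[train_idx] against the
# full outcomes array; the lengths only match because the inner join with
# outcomes.csv leaves no post-2014 rows (outcomes exist only for the training
# period). An explicitly aligned call would be
#   grid_search.fit(train, (outcomes == 't')[train_idx])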

In [16]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SGDClassifier()),
])
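# SGDClassifier's default hinge loss has no predict_proba; the 'roc_auc' scorer
# used below ranks examples with decision_function instead, which is all AUC needs.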

In [18]:
parameters = {
    'tfidf__use_idf': [False],
    'tfidf__norm': ('l1', 'l2'),
    'tfidf__max_df':[1.0],
    'tfidf__max_features':(10000,20000),
    #'clf__alpha': (0.001, 0.00001, 0.000001),
    #'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 100),
    #'clf__loss':('modified_huber','log'),
    
}
grid_search = GridSearchCV(pipeline, parameters, verbose=1, scoring='roc_auc')
grid_search.fit(train, outcomes=='t')


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:  2.7min
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed: 31.4min finished
Out[18]:
GridSearchCV(cv=None,
       estimator=Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, charset=None,
        charset_error=None, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), nor...ower_t=0.5,
       random_state=None, rho=None, shuffle=False, verbose=0,
       warm_start=False))]),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'tfidf__max_features': (10000, 20000), 'tfidf__max_df': [1.0], 'tfidf__use_idf': [False], 'tfidf__norm': ('l1', 'l2')},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='roc_auc', verbose=1)

In [19]:
end = datetime.now()
print(end - start)


0:50:20.975347

In [20]:
grid_search.best_estimator_


Out[20]:
Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, charset=None,
        charset_error=None, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=20000, min_df=1,
        ngram_range=(1, 1), no...ower_t=0.5,
       random_state=None, rho=None, shuffle=False, verbose=0,
       warm_start=False))])

In [21]:
grid_search.best_params_


Out[21]:
{'tfidf__max_df': 1.0,
 'tfidf__max_features': 20000,
 'tfidf__norm': 'l1',
 'tfidf__use_idf': False}

In [22]:
grid_search.best_score_


Out[22]:
0.65453184007251464

In [23]:
grid_search.grid_scores_


Out[23]:
[mean: 0.65297, std: 0.00257, params: {'tfidf__max_df': 1.0, 'tfidf__use_idf': False, 'tfidf__max_features': 10000, 'tfidf__norm': 'l1'},
 mean: 0.64724, std: 0.00432, params: {'tfidf__max_df': 1.0, 'tfidf__use_idf': False, 'tfidf__max_features': 10000, 'tfidf__norm': 'l2'},
 mean: 0.65453, std: 0.00137, params: {'tfidf__max_df': 1.0, 'tfidf__use_idf': False, 'tfidf__max_features': 20000, 'tfidf__norm': 'l1'},
 mean: 0.64601, std: 0.00160, params: {'tfidf__max_df': 1.0, 'tfidf__use_idf': False, 'tfidf__max_features': 20000, 'tfidf__norm': 'l2'}]
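
A natural next step, sketched below rather than run above, is to score the competition's unlabeled essays with the refit best_estimator_ and fill in the sample submission. The essays are reloaded because the training cells deleted them; the column layout of essays.csv and sampleSubmission.csv and the output filename tfidf_sgd_submission.csv are assumptions, and decision_function is used because hinge-loss SGD provides no probabilities.

# Sketch: score the unlabeled essays and build a submission (assumed column
# names; the output filename is illustrative).
test_essays = pd.read_csv('../data/essays.csv')
test_essays = test_essays[test_essays.projectid.isin(sample.projectid)]
test_essays['essay'] = test_essays['essay'].apply(clean)

best = grid_search.best_estimator_                       # refit=True, so already retrained on all training essays
scores = best.decision_function(test_essays['essay'])    # raw margins; AUC only needs a ranking

preds = pd.DataFrame({'projectid': test_essays.projectid.values,
                      'is_exciting': scores})
submission = sample[['projectid']].merge(preds, on='projectid', how='left')
submission.to_csv('tfidf_sgd_submission.csv', index=False)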