In [12]:
from sklearn.grid_search import GridSearchCV  # pre-0.18 scikit-learn location; later releases use sklearn.model_selection
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
import re
from datetime import datetime
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

In [13]:
def clean(s):
    # Keep only word characters and lower-case the result; non-string values
    # (e.g. NaN essays) fall back to a placeholder token.
    try:
        return " ".join(re.findall(r'\w+', s, flags=re.UNICODE)).lower()
    except TypeError:
        return "no_text"

start = datetime.now()
#donations = pd.read_csv('donations.csv')
projects = pd.read_csv('../data/projects.csv')
outcomes = pd.read_csv('../data/outcomes.csv')
#resources = pd.read_csv('resources.csv')
sample = pd.read_csv('../data/sampleSubmission.csv')
essays = pd.read_csv('../data/essays.csv')


# .sort(column) is the older pandas spelling of sort_values(); order the
# frames by projectid so rows line up across files.
ess_proj = essays.sort('projectid')
projects = projects.sort('projectid')
sample = sample.sort('projectid')
#ess_proj = pd.merge(essays, projects, on='projectid')
outcomes = outcomes.sort('projectid')

In [14]:
# Keep only projects that have both an outcome and an essay (inner joins),
# split by posting date, and pull out the label column.
projects = projects.merge(outcomes, how='inner')
projects = projects.merge(essays, how='inner')
del essays
dates = np.array(projects.date_posted)
train_idx = np.where(dates < '2014-01-01')[0]
test_idx = np.where(dates >= '2014-01-01')[0]
outcomes = np.array(projects.is_exciting)
projects['essay'] = projects['essay'].apply(clean)

In [15]:
projects = np.array(projects.essay)
train = projects[train_idx]
del projects
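# Note: the grid search below is fit on train = projects[train_idx] against the
# full outcomes array; the lengths only match because the inner join with
# outcomes.csv leaves no post-2014 rows (outcomes exist only for the training
# period). An explicitly aligned call would be
#   grid_search.fit(train, (outcomes == 't')[train_idx])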

In [16]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SGDClassifier()),
])
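# SGDClassifier's default hinge loss has no predict_proba; the 'roc_auc' scorer
# used below ranks examples with decision_function instead, which is all AUC needs.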

In [18]:
parameters = {
    'tfidf__use_idf': [False],
    'tfidf__norm': ('l1', 'l2'),
    'tfidf__max_df':[1.0],
    'tfidf__max_features':(10000,20000),
    #'clf__alpha': (0.001, 0.00001, 0.000001),
    #'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 100),
    #'clf__loss':('modified_huber','log'),
    
}
grid_search = GridSearchCV(pipeline, parameters, verbose=1, scoring='roc_auc')
grid_search.fit(train, outcomes=='t')


Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:  2.7min
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed: 31.4min finished
Out[18]:
GridSearchCV(cv=None,
       estimator=Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, charset=None,
        charset_error=None, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), nor...ower_t=0.5,
       random_state=None, rho=None, shuffle=False, verbose=0,
       warm_start=False))]),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'tfidf__max_features': (10000, 20000), 'tfidf__max_df': [1.0], 'tfidf__use_idf': [False], 'tfidf__norm': ('l1', 'l2')},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='roc_auc', verbose=1)

In [19]:
end = datetime.now()
print(end - start)


0:50:20.975347

In [20]:
grid_search.best_estimator_


Out[20]:
Pipeline(steps=[('tfidf', TfidfVectorizer(analyzer=u'word', binary=False, charset=None,
        charset_error=None, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=20000, min_df=1,
        ngram_range=(1, 1), no...ower_t=0.5,
       random_state=None, rho=None, shuffle=False, verbose=0,
       warm_start=False))])

In [21]:
grid_search.best_params_


Out[21]:
{'tfidf__max_df': 1.0,
 'tfidf__max_features': 20000,
 'tfidf__norm': 'l1',
 'tfidf__use_idf': False}

In [22]:
grid_search.best_score_


Out[22]:
0.65453184007251464

In [23]:
grid_search.grid_scores_


Out[23]:
[mean: 0.65297, std: 0.00257, params: {'tfidf__max_df': 1.0, 'tfidf__use_idf': False, 'tfidf__max_features': 10000, 'tfidf__norm': 'l1'},
 mean: 0.64724, std: 0.00432, params: {'tfidf__max_df': 1.0, 'tfidf__use_idf': False, 'tfidf__max_features': 10000, 'tfidf__norm': 'l2'},
 mean: 0.65453, std: 0.00137, params: {'tfidf__max_df': 1.0, 'tfidf__use_idf': False, 'tfidf__max_features': 20000, 'tfidf__norm': 'l1'},
 mean: 0.64601, std: 0.00160, params: {'tfidf__max_df': 1.0, 'tfidf__use_idf': False, 'tfidf__max_features': 20000, 'tfidf__norm': 'l2'}]
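
A natural next step, sketched below rather than run above, is to score the competition's unlabeled essays with the refit best_estimator_ and fill in the sample submission. The essays are reloaded because the training cells deleted them; the column layout of essays.csv and sampleSubmission.csv and the output filename tfidf_sgd_submission.csv are assumptions, and decision_function is used because hinge-loss SGD provides no probabilities.

# Sketch: score the unlabeled essays and build a submission (assumed column
# names; the output filename is illustrative).
test_essays = pd.read_csv('../data/essays.csv')
test_essays = test_essays[test_essays.projectid.isin(sample.projectid)]
test_essays['essay'] = test_essays['essay'].apply(clean)

best = grid_search.best_estimator_                       # refit=True, so already retrained on all training essays
scores = best.decision_function(test_essays['essay'])    # raw margins; AUC only needs a ranking

preds = pd.DataFrame({'projectid': test_essays.projectid.values,
                      'is_exciting': scores})
submission = sample[['projectid']].merge(preds, on='projectid', how='left')
submission.to_csv('tfidf_sgd_submission.csv', index=False)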