In [12]:
from sklearn.grid_search import GridSearchCV
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
import re
from datetime import datetime
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
In [13]:
def clean(s):
    """Normalize raw text into lowercase word tokens joined by single spaces.

    Every maximal run of word characters (letters, digits, underscore) is
    extracted from ``s``; the tokens are joined with one space and lowercased,
    which strips punctuation and collapses all whitespace.

    Parameters
    ----------
    s : str
        Text to normalize.  Anything the regex engine rejects as a subject
        (e.g. ``None`` or a float NaN standing in for missing text) is
        accepted and mapped to the placeholder below.

    Returns
    -------
    str
        The normalized text, or the placeholder token ``"no_text"`` when
        ``s`` cannot be searched by the pattern.
    """
    try:
        # re.UNICODE only: combining it with re.LOCALE is rejected for str
        # patterns on Python 3.6+ (ValueError), which made every call fail.
        words = re.findall(r'\w+', s, flags=re.UNICODE)
    except TypeError:
        # Only the "not text" case is treated as missing; other errors
        # (and KeyboardInterrupt) are no longer swallowed by a bare except.
        return "no_text"
    return " ".join(words).lower()
# Record when processing begins; a later cell prints the elapsed time.
start = datetime.now()

# Load the input tables from the shared data directory.
#donations = pd.read_csv('donations.csv')
projects = pd.read_csv('../data/projects.csv')
outcomes = pd.read_csv('../data/outcomes.csv')
#resources = pd.read_csv('resources.csv')
sample = pd.read_csv('../data/sampleSubmission.csv')
essays = pd.read_csv('../data/essays.csv')

# Order every table by project id.  `DataFrame.sort` was removed from pandas;
# `sort_values` is its replacement and yields the same ordering.
# Note: `ess_proj` and `sample` are not referenced by the cells shown below.
ess_proj = essays.sort_values('projectid')
projects = projects.sort_values('projectid')
sample = sample.sort_values('projectid')
#ess_proj = pd.merge(essays, projects, on='projectid')
outcomes = outcomes.sort_values('projectid')
In [14]:
# Inner-join the outcome labels, then the essay text, onto the project rows
# (pandas joins on the columns the frames have in common).
projects = projects.merge(outcomes, how='inner')
projects = projects.merge(essays, how='inner')
del essays  # the standalone essays frame is not needed after the join

# Positional indices of rows posted before / on-or-after 2014-01-01.
# The dates are compared as strings against the literal cutoff.
dates = np.array(projects['date_posted'])
train_idx = np.flatnonzero(dates < '2014-01-01')
test_idx = np.flatnonzero(dates >= '2014-01-01')

# Keep the label column as a plain array (this rebinds `outcomes`, replacing
# the outcomes DataFrame), then normalize every essay with `clean`.
outcomes = np.array(projects['is_exciting'])
projects['essay'] = projects['essay'].apply(clean)
In [15]:
# Pull the cleaned essay text out as an array, release the merged frame,
# and keep only the rows selected by `train_idx` as the training documents.
essay_texts = np.array(projects.essay)
del projects
train = essay_texts[train_idx]
del essay_texts
In [16]:
# Two-step text pipeline: a TfidfVectorizer turns each document into a feature
# vector, and an SGDClassifier is trained on those features.  The step names
# ('tfidf', 'clf') are the prefixes used by the grid-search parameter keys.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    # Fixed seed: without it the classifier's internal shuffling differs on
    # every run, so the reported scores cannot be reproduced.
    ('clf', SGDClassifier(random_state=42)),
])
In [18]:
# Hyper-parameter grid for the pipeline; keys are '<step name>__<parameter>'.
# Only the vectorizer settings are searched; the classifier options are kept
# commented out as candidates for a wider search.
parameters = {
    'tfidf__use_idf': [False],
    'tfidf__norm': ('l1', 'l2'),
    'tfidf__max_df': [1.0],
    'tfidf__max_features': (10000, 20000),
    #'clf__alpha': (0.001, 0.00001, 0.000001),
    #'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 100),
    #'clf__loss':('modified_huber','log'),
}
grid_search = GridSearchCV(pipeline, parameters, verbose=1, scoring='roc_auc')

# `train` holds only the rows selected by `train_idx`, so the labels must be
# restricted to the same rows.  Passing the full `outcomes` array only works
# when every row happens to fall in the training period and fails with a
# length mismatch otherwise.  Labels are True where the outcome is 't'.
grid_search.fit(train, outcomes[train_idx] == 't')
Out[18]:
In [19]:
# Report how long the run took, measured from the `start` timestamp.
end = datetime.now()
print(end - start)
In [20]:
# Display the estimator the grid search selected (its `best_estimator_` attribute).
grid_search.best_estimator_
Out[20]:
In [21]:
# Display the parameter combination from `parameters` that the search picked.
grid_search.best_params_
Out[21]:
In [22]:
# Display the best score found; the search was configured with scoring='roc_auc'.
grid_search.best_score_
Out[22]:
In [23]:
# Display the per-combination results recorded by the search.
# NOTE(review): `grid_scores_` comes with the legacy `sklearn.grid_search`
# GridSearchCV imported in the first cell; the `sklearn.model_selection`
# version presumably exposes `cv_results_` instead — confirm before upgrading.
grid_search.grid_scores_
Out[23]: