In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.grid_search import GridSearchCV
In [2]:
# Load the tables used below, each ordered by `projectid`. Later cells pair
# rows of `projects` with rows of `outcomes` purely by position, so a
# consistent ordering matters.
# `DataFrame.sort` was deprecated in pandas 0.17 and removed in 0.20;
# `sort_values` is the direct replacement and yields the same ordering.
# Tables that this model does not use are left commented out.
#donations = pd.read_csv('../data/donations.csv').sort_values('projectid')
projects = pd.read_csv('../data/projects.csv').sort_values('projectid')
outcomes = pd.read_csv('../data/outcomes.csv').sort_values('projectid')
#resources = pd.read_csv('../data/resources.csv').sort_values('projectid')
sample = pd.read_csv('../data/sampleSubmission.csv').sort_values('projectid')
#essays = pd.read_csv('../data/essays.csv').sort_values('projectid')
In [3]:
# Positional row indices into `projects`, split on the posting date.
# Dates are compared as strings, which orders correctly only for
# ISO-style 'YYYY-MM-DD' values -- NOTE(review): confirm the column format.
dates = np.array(projects.date_posted)
# Training rows: posted strictly after 2010-01-01 and before 2014-01-01.
train_idx = np.flatnonzero((dates > '2010-01-01') & (dates < '2014-01-01'))
# Test rows: posted on or after 2014-01-01.
test_idx = np.flatnonzero(dates >= '2014-01-01')
In [4]:
# Keep only outcomes whose project was posted after 2010-01-01 (the same lower
# bound used for train_idx). No `on=` is given, so pandas joins on every
# column name the two frames share -- presumably just `projectid`; verify.
outcomes = pd.merge(
    outcomes,
    projects.loc[projects.date_posted > '2010-01-01'],
    how='inner',
)
In [5]:
# Sanity check: the number of rows in `outcomes` should equal the number of
# training indices, because labels are later matched to training rows by position.
(outcomes.shape, train_idx.shape)
Out[5]:
In [6]:
# Forward-fill missing values: every NaN takes the closest non-missing value
# above it in the same column. `fillna(method='pad')` is deprecated in recent
# pandas; `ffill()` is the exact equivalent.
# NOTE(review): a NaN in the very first row has nothing to copy from and stays
# missing, and the borrowed value comes from whichever project happens to sort
# just before -- consider an explicit "missing" category instead.
projects = projects.ffill()
In [7]:
# From here on `outcomes` is a plain NumPy array holding only the
# `is_exciting` column (the merged DataFrame is discarded). Because the name is
# rebound, re-running this cell on its own will fail.
outcomes = np.array(outcomes['is_exciting'])
In [8]:
# Columns of `projects` that are label-encoded and one-hot encoded below.
# The order is significant: it fixes the column positions of the feature matrix.
projectCatogorialColumns = [
    # school location
    'school_city',
    'school_state',
    'school_zip',
    'school_metro',
    'school_district',
    'school_county',
    # school flags
    'school_charter',
    'school_magnet',
    'school_year_round',
    'school_nlns',
    'school_kipp',
    'school_charter_ready_promise',
    # teacher attributes
    'teacher_prefix',
    'teacher_teach_for_america',
    'teacher_ny_teaching_fellow',
    # project subject matter
    'primary_focus_subject',
    'primary_focus_area',
    'secondary_focus_subject',
    'secondary_focus_area',
    # project attributes
    'resource_type',
    'poverty_level',
    'grade_level',
    # NOTE(review): students_reached looks like a count rather than a
    # category -- confirm that one-hot encoding it is intended.
    'students_reached',
    'eligible_double_your_impact_match',
    'eligible_almost_home_match',
]
In [9]:
# Reduce `projects` to the categorical columns and convert it to a 2-D NumPy
# array (one column per entry of projectCatogorialColumns). The DataFrame is
# discarded, so this cell cannot be re-run without reloading the data.
projects = np.array(projects.loc[:, projectCatogorialColumns])
In [10]:
# Replace each column's raw values with integer codes, using a fresh
# LabelEncoder per column so codes are independent between columns.
for col in range(projects.shape[1]):
    le = LabelEncoder()
    projects[:, col] = le.fit_transform(projects[:, col])
# Once every column holds numeric codes, convert the whole array to float.
# This must happen after the loop: casting earlier would fail on columns that
# still contain their original values.
projects = projects.astype(float)
In [11]:
# Expand each integer-coded column into one indicator column per code.
# The encoder is fitted on all rows (train and test together), so every
# category seen in either split gets a column.
# Presumably the result is a SciPy sparse matrix (the encoder's default) -- verify.
ohe = OneHotEncoder()
ohe.fit(projects)
projects = ohe.transform(projects)
In [12]:
# Slice the encoded feature matrix into training and test rows using the
# positional indices computed from the posting dates.
train, test = projects[train_idx], projects[test_idx]
In [13]:
# Grid search over an SGD-trained linear classifier, scored by ROC AUC.
# SGD training is stochastic; fixing `random_state` makes the cross-validation
# scores reproducible across kernel restarts (it pins any data shuffling).
RANDOM_SEED = 42
lr = SGDClassifier(random_state=RANDOM_SEED)
# Only `n_iter` takes more than one value, so the search compares exactly two
# candidates; the remaining entries simply pin fixed settings.
parameters = {
    'alpha': [0.001],
    'loss': ['log'],
    'penalty': ['l2'],
    'n_iter': [100, 1000],
    # NOTE(review): the estimator's own n_jobs is set here while GridSearchCV
    # also runs with n_jobs=-1 -- confirm the nested parallelism is intended.
    'n_jobs': [-1],
}
clf = GridSearchCV(lr, parameters, scoring='roc_auc', n_jobs=-1)
In [14]:
# Run the grid search. The target is a boolean array that is True where the
# outcome value equals 't' (presumably the "exciting" flag -- verify encoding).
# Rows of `train` and entries of `outcomes` are matched by position.
clf.fit(X=train, y=(outcomes == 't'))
Out[14]:
In [15]:
# Show the cross-validation results recorded for each parameter combination.
# NOTE(review): `grid_scores_` belongs to the legacy `sklearn.grid_search`
# GridSearchCV imported at the top of this notebook.
clf.grid_scores_
Out[15]:
In [16]:
# Show the estimator configured with the best-scoring parameter combination.
clf.best_estimator_
Out[16]:
In [17]:
# Show the best cross-validated score found (ROC AUC, per the `scoring`
# argument passed to GridSearchCV).
clf.best_score_
Out[17]:
In [18]:
# Show the parameter combination that achieved the best score.
clf.best_params_
Out[18]:
In [18]: