In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# sklearn.grid_search was renamed to sklearn.model_selection in scikit-learn 0.18
from sklearn.grid_search import GridSearchCV

In [2]:
# DataFrame.sort() is pre-0.17 pandas; on newer versions use .sort_values('projectid').
# Every table is sorted by projectid so that rows line up by position across frames.
#donations = pd.read_csv('../data/donations.csv').sort('projectid')
projects = pd.read_csv('../data/projects.csv').sort('projectid')
outcomes = pd.read_csv('../data/outcomes.csv').sort('projectid')
#resources = pd.read_csv('../data/resources.csv').sort('projectid')
sample = pd.read_csv('../data/sampleSubmission.csv').sort('projectid')
#essays = pd.read_csv('../data/essays.csv').sort('projectid')
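
Row-position alignment is the load-bearing assumption here: the labels are later paired with training rows purely by index, which only works if projectid is unique and identically ordered in every frame. A quick hedged sanity check:

# Sketch: alignment below relies on projectid being unique in each table.
assert projects.projectid.is_unique
assert outcomes.projectid.is_unique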

In [3]:
# date_posted is an ISO yyyy-mm-dd string, so lexicographic comparison is date order:
# labelled (pre-2014) rows become the training set, 2014 rows the test set.
dates = np.array(projects.date_posted)
train_idx = np.where(dates < '2014-01-01')[0]
test_idx = np.where(dates >= '2014-01-01')[0]
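
Since outcomes.csv only covers projects posted before 2014, train_idx should pick out exactly the labelled rows. A hedged check of that assumption (outcomes is still a DataFrame at this point):

# Sketch: the labelled rows should match outcomes row-for-row.
assert len(train_idx) == len(outcomes)
assert (projects.projectid.values[train_idx] == outcomes.projectid.values).all()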

In [4]:
# Forward-fill missing values with the previous row's value (method='pad' is ffill);
# a quick hack whose result depends on the projectid sort order.
projects = projects.fillna(method='pad')
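
Forward-filling categorical columns borrows whatever value the previous project happened to have. A hedged alternative that keeps missingness as its own level instead (using the column list defined in In [6]):

# Alternative sketch: encode missing categoricals as an explicit level.
# projects[projectCategoricalColumns] = projects[projectCategoricalColumns].fillna('missing')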

In [5]:
# Keep only the target column; is_exciting holds 't'/'f' strings.
outcomes = np.array(outcomes.is_exciting)
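
Before modelling it is worth checking the class balance, since ROC AUC is the grid-search metric below; a one-line sketch:

# Sketch: share of positive ('t') labels in the training data.
print((outcomes == 't').mean())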

In [6]:
projectCategoricalColumns = ['school_city', 'school_state', 'school_zip', 'school_metro',
    'school_district', 'school_county', 'school_charter', 'school_magnet',
    'school_year_round', 'school_nlns', 'school_kipp', 'school_charter_ready_promise',
    'teacher_prefix', 'teacher_teach_for_america', 'teacher_ny_teaching_fellow',
    'primary_focus_subject', 'primary_focus_area', 'secondary_focus_subject',
    'secondary_focus_area', 'resource_type', 'poverty_level', 'grade_level',
    'students_reached', 'eligible_double_your_impact_match', 'eligible_almost_home_match']
latitudeLongitudeColumns = ['school_latitude', 'school_longitude']

In [7]:
# Round coordinates up to whole degrees, then pack (lat, lon) into a single
# integer code so each one-degree grid cell becomes one categorical value.
latitudeLongitude = np.array(projects[latitudeLongitudeColumns])
latitudeLongitude = np.ceil(latitudeLongitude)
latitudeLongitude[:,0] = 180*latitudeLongitude[:,0] + latitudeLongitude[:,1]
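
To see what the packing does: ceil puts every school into a one-degree grid cell, and 180*lat + lon maps each cell to one integer (for US longitudes, which span well under 180 degrees, distinct cells get distinct codes). A small illustration with a made-up coordinate:

# Sketch: one hypothetical coordinate pair -> one cell code.
lat, lon = np.ceil([40.7, -74.0])   # e.g. a New York City school
print(180*lat + lon)                # 7306.0: the cell's single categorical code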

In [8]:
data = np.array(projects[projectCategoricalColumns])
data = np.column_stack((data, latitudeLongitude[:,0]))
del projects, latitudeLongitude  # free memory; neither is needed again

In [9]:
# Integer-encode every column in place; LabelEncoder is fit on train and test
# together, so the test rows introduce no unseen categories.
for i in range(0, data.shape[1]):
    le = LabelEncoder()
    data[:,i] = le.fit_transform(data[:,i])
data = data.astype(float)  # OneHotEncoder below expects a numeric array
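
The same integer coding can be had from numpy alone; a hedged equivalent for a single column, for comparison:

# Sketch: np.unique's return_inverse gives the same per-column integer codes.
# codes = np.unique(data[:, i], return_inverse=True)[1]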

In [10]:
# Expand the integer codes into sparse indicator columns.
ohe = OneHotEncoder()
data = ohe.fit_transform(data)
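
OneHotEncoder returns a SciPy sparse matrix, which is what keeps the 664,098 x 38,673 result in Out[11] below tractable: each row has only one non-zero per original column. A quick sketch:

# Sketch: ~26 nonzeros per row (one per encoded column) instead of ~38k dense floats.
print(type(data))
print(data.nnz / float(data.shape[0]))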

In [11]:
data.shape


Out[11]:
(664098, 38673)

In [12]:
train = data[train_idx]
test = data[test_idx]
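
Sparse row indexing keeps both splits sparse; a one-line partition check:

# Sketch: the two splits partition the one-hot matrix by row.
assert train.shape[0] + test.shape[0] == data.shape[0]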

In [13]:
lr = SGDClassifier()
parameters = {'alpha': [0.0001, 0.001, 0.01, 0.1], 'loss': ['modified_huber', 'log'],
              'penalty': ['l2'], 'n_iter': [100, 1000], 'n_jobs': [-1]}
# refit=True so best_estimator_ is available afterwards (see In [16]).
clf = GridSearchCV(lr, parameters, scoring='roc_auc', n_jobs=-1, refit=True, verbose=3)
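
On scikit-learn 0.18+ the same search is written against sklearn.model_selection, with n_iter renamed to max_iter, and n_jobs dropped from the grid since it never changes the fitted model. A hedged sketch of the modern equivalent (not part of this run):

# Sketch for scikit-learn >= 0.20; parameter names differ from the run above.
# from sklearn.model_selection import GridSearchCV
# parameters = {'alpha': [0.0001, 0.001, 0.01, 0.1],
#               'loss': ['modified_huber', 'log'],
#               'penalty': ['l2'], 'max_iter': [100, 1000]}
# clf = GridSearchCV(SGDClassifier(), parameters, scoring='roc_auc', n_jobs=-1, verbose=3)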

In [14]:
clf.fit(train, outcomes=='t')  # binary target: True where is_exciting == 't'


Out[14]:
GridSearchCV(cv=None,
       estimator=SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
       fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
       loss='hinge', n_iter=5, n_jobs=1, penalty='l2', power_t=0.5,
       random_state=None, rho=None, shuffle=False, verbose=0,
       warm_start=False),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid={'penalty': ['l2'], 'alpha': [0.0001, 0.001, 0.01, 0.1], 'n_iter': [100, 1000], 'n_jobs': [-1], 'loss': ['modified_huber', 'log']},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='roc_auc', verbose=0)

In [15]:
clf.grid_scores_


Out[15]:
[mean: 0.65235, std: 0.00215, params: {'penalty': 'l2', 'alpha': 0.0001, 'n_iter': 100, 'n_jobs': -1, 'loss': 'modified_huber'},
 mean: 0.65974, std: 0.00133, params: {'penalty': 'l2', 'alpha': 0.0001, 'n_iter': 1000, 'n_jobs': -1, 'loss': 'modified_huber'},
 mean: 0.65701, std: 0.00062, params: {'penalty': 'l2', 'alpha': 0.0001, 'n_iter': 100, 'n_jobs': -1, 'loss': 'log'},
 mean: 0.65691, std: 0.00059, params: {'penalty': 'l2', 'alpha': 0.0001, 'n_iter': 1000, 'n_jobs': -1, 'loss': 'log'},
 mean: 0.66344, std: 0.00055, params: {'penalty': 'l2', 'alpha': 0.001, 'n_iter': 100, 'n_jobs': -1, 'loss': 'modified_huber'},
 mean: 0.66381, std: 0.00058, params: {'penalty': 'l2', 'alpha': 0.001, 'n_iter': 1000, 'n_jobs': -1, 'loss': 'modified_huber'},
 mean: 0.63272, std: 0.00047, params: {'penalty': 'l2', 'alpha': 0.001, 'n_iter': 100, 'n_jobs': -1, 'loss': 'log'},
 mean: 0.63269, std: 0.00046, params: {'penalty': 'l2', 'alpha': 0.001, 'n_iter': 1000, 'n_jobs': -1, 'loss': 'log'},
 mean: 0.64505, std: 0.00051, params: {'penalty': 'l2', 'alpha': 0.01, 'n_iter': 100, 'n_jobs': -1, 'loss': 'modified_huber'},
 mean: 0.64502, std: 0.00050, params: {'penalty': 'l2', 'alpha': 0.01, 'n_iter': 1000, 'n_jobs': -1, 'loss': 'modified_huber'},
 mean: 0.60441, std: 0.00041, params: {'penalty': 'l2', 'alpha': 0.01, 'n_iter': 100, 'n_jobs': -1, 'loss': 'log'},
 mean: 0.60442, std: 0.00041, params: {'penalty': 'l2', 'alpha': 0.01, 'n_iter': 1000, 'n_jobs': -1, 'loss': 'log'},
 mean: 0.61315, std: 0.00083, params: {'penalty': 'l2', 'alpha': 0.1, 'n_iter': 100, 'n_jobs': -1, 'loss': 'modified_huber'},
 mean: 0.61316, std: 0.00084, params: {'penalty': 'l2', 'alpha': 0.1, 'n_iter': 1000, 'n_jobs': -1, 'loss': 'modified_huber'},
 mean: 0.55392, std: 0.00201, params: {'penalty': 'l2', 'alpha': 0.1, 'n_iter': 100, 'n_jobs': -1, 'loss': 'log'},
 mean: 0.55396, std: 0.00201, params: {'penalty': 'l2', 'alpha': 0.1, 'n_iter': 1000, 'n_jobs': -1, 'loss': 'log'}]
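
grid_scores_ entries are namedtuples with a mean_validation_score field, so the configurations can be re-listed best-first:

# Sketch: sort the searched configurations by mean validation AUC, best first.
for score in sorted(clf.grid_scores_, key=lambda s: -s.mean_validation_score):
    print(score)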

In [16]:
clf.best_estimator_


Out[16]:
SGDClassifier(alpha=0.001, class_weight=None, epsilon=0.1, eta0=0.0,
       fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
       loss='modified_huber', n_iter=1000, n_jobs=-1, penalty='l2',
       power_t=0.5, random_state=None, rho=None, shuffle=False, verbose=0,
       warm_start=False)

In [17]:
clf.best_score_


Out[17]:
0.66381118144892548

In [18]:
clf.best_params_


Out[18]:
{'alpha': 0.001,
 'loss': 'modified_huber',
 'n_iter': 1000,
 'n_jobs': -1,
 'penalty': 'l2'}
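
The sample submission frame loaded in In [2] is never used above. A hedged sketch of the remaining step, refitting the winning configuration on all labelled rows and scoring the 2014 projects; this assumes sample holds exactly the post-2014 projectids in the same sorted order as test, with a prediction column named is_exciting as in the competition's sampleSubmission.csv:

# Sketch: refit the best configuration and write a submission file.
best = SGDClassifier(**clf.best_params_)
best.fit(train, outcomes == 't')
# modified_huber supports predict_proba; column 1 is P(is_exciting).
sample['is_exciting'] = best.predict_proba(test)[:, 1]
sample.to_csv('predictions.csv', index=False)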