In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.grid_search import GridSearchCV
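
Note: sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20; on a modern install the equivalent import is below (there, the grid_scores_ attribute used later is replaced by cv_results_):

from sklearn.model_selection import GridSearchCV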

In [2]:
# Load the competition tables; sorting on projectid keeps the rows of
# projects, outcomes, and the submission template aligned.
# (DataFrame.sort is the pre-0.20 pandas API; newer pandas uses sort_values.)
#donations = pd.read_csv('../data/donations.csv').sort('projectid')
projects = pd.read_csv('../data/projects.csv').sort('projectid')
outcomes = pd.read_csv('../data/outcomes.csv').sort('projectid')
#resources = pd.read_csv('../data/resources.csv').sort('projectid')
sample = pd.read_csv('../data/sampleSubmission.csv').sort('projectid')
#essays = pd.read_csv('../data/essays.csv').sort('projectid')

In [3]:
# Time-based split: projects posted before 2014-01-01 form the training
# set (the rows covered by outcomes); later ones are the test set to
# predict. ISO date strings compare correctly as plain strings.
dates = np.array(projects.date_posted)
train_idx = np.where(dates < '2014-01-01')[0]
test_idx = np.where(dates >= '2014-01-01')[0]

In [4]:
# Forward-fill missing values so the encoders below never see NaN.
projects = projects.fillna(method='pad')

In [5]:
# is_exciting holds 't'/'f' strings; keep the raw array and binarize
# at fit time with outcomes == 't'.
outcomes = np.array(outcomes.is_exciting)

In [6]:
projectCategoricalColumns = [
    'school_city', 'school_state', 'school_zip', 'school_metro',
    'school_district', 'school_county', 'school_charter', 'school_magnet',
    'school_year_round', 'school_nlns', 'school_kipp',
    'school_charter_ready_promise', 'teacher_prefix',
    'teacher_teach_for_america', 'teacher_ny_teaching_fellow',
    'primary_focus_subject', 'primary_focus_area',
    'secondary_focus_subject', 'secondary_focus_area', 'resource_type',
    'poverty_level', 'grade_level', 'students_reached',
    'eligible_double_your_impact_match', 'eligible_almost_home_match']

In [7]:
# Keep only the categorical columns, as a raw object array for encoding.
projects = np.array(projects[projectCategoricalColumns])

In [8]:
# Integer-encode each column: LabelEncoder maps the distinct values of a
# column to integers 0..n_classes-1 (in sorted order).
for i in range(projects.shape[1]):
    le = LabelEncoder()
    projects[:, i] = le.fit_transform(projects[:, i])
projects = projects.astype(float)
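
As a toy aside (the values are invented, not from the dataset), this is the per-column behavior the loop relies on:

le = LabelEncoder()
le.fit_transform(['TX', 'CA', 'TX', 'NY'])   # -> array([2, 0, 2, 1])
le.classes_                                  # -> array(['CA', 'NY', 'TX'], dtype='<U2')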

In [9]:
# Expand the integer codes into sparse binary indicator columns, one per
# (column, level) pair.
ohe = OneHotEncoder()
projects = ohe.fit_transform(projects)
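
A quick optional sanity check of what the encoder produced (the exact shape depends on the data, so no numbers are shown):

print(type(projects))   # a scipy.sparse matrix
print(projects.shape)   # (number of projects, total indicator columns)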

In [10]:
# Slice the sparse matrix back into train/test rows by posting date.
train = projects[train_idx]
test = projects[test_idx]

In [11]:
# Grid-search the regularization strength and penalty type, selecting by
# cross-validated ROC AUC (cv=None means 3-fold stratified CV in this
# scikit-learn version); n_jobs=-1 uses all cores.
lr = LogisticRegression()
parameters = {'C': [0.01, 0.1, 1, 10], 'penalty': ['l1', 'l2']}
clf = GridSearchCV(lr, parameters, scoring='roc_auc', n_jobs=-1)

In [12]:
# Binarize the target ('t' -> True) and fit. The rows of outcomes align
# with the pre-2014 training rows because both tables were sorted by projectid.
clf.fit(train, outcomes == 't')


Out[12]:
GridSearchCV(cv=None,
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001),
       fit_params={}, iid=True, loss_func=None, n_jobs=-1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='roc_auc', verbose=0)

In [13]:
clf.grid_scores_


Out[13]:
[mean: 0.60800, std: 0.00078, params: {'penalty': 'l1', 'C': 0.01},
 mean: 0.64642, std: 0.00068, params: {'penalty': 'l2', 'C': 0.01},
 mean: 0.63969, std: 0.00061, params: {'penalty': 'l1', 'C': 0.1},
 mean: 0.66460, std: 0.00092, params: {'penalty': 'l2', 'C': 0.1},
 mean: 0.66139, std: 0.00081, params: {'penalty': 'l1', 'C': 1},
 mean: 0.65921, std: 0.00128, params: {'penalty': 'l2', 'C': 1},
 mean: 0.64540, std: 0.00178, params: {'penalty': 'l1', 'C': 10},
 mean: 0.64633, std: 0.00176, params: {'penalty': 'l2', 'C': 10}]

In [14]:
clf.best_estimator_


Out[14]:
LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)

In [15]:
clf.best_score_


Out[15]:
0.66459685008753022

In [16]:
clf.best_params_


Out[16]:
{'C': 0.1, 'penalty': 'l2'}
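
The sample frame loaded in In [2] is never used above; a minimal sketch of finishing the pipeline with it, assuming its projectid-sorted rows line up with the post-2014 test rows ('predictions.csv' is just a placeholder name):

preds = clf.predict_proba(test)[:, 1]         # probability of is_exciting == 't'
sample['is_exciting'] = preds
sample.to_csv('predictions.csv', index=False)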