Shelter Animal Outcomes 4

Logistic Regression



In [1]:

    
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation
from sklearn.feature_selection import RFECV
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from time import time
from operator import itemgetter
import numpy as np
import pandas as pd



In [2]:

    
# Load the training and test tables (in that order) from CSV files in the
# parent directory, using pandas' default parsing options.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../Shelter_train.csv', '../Shelter_test.csv')
)



In [3]:

    
# Split the training frame purely by column position: every column except
# the last one is treated as a feature, and the last column is the target.
# `.iloc` is used instead of `.ix`, which is deprecated and no longer
# exists in current pandas releases; for integer slices over labelled
# columns the two select the same data.
X = df_train.iloc[:, :-1]
y = df_train.iloc[:, -1]

# Remove the 'ID' column from the test frame so that the remaining columns
# can be handed to the model for prediction (presumably they then line up
# with the columns of X -- verify against the CSV headers).
# `axis=1` is passed by keyword because newer pandas rejects it positionally.
# NOTE: this rebinds `df_test`, so re-running this cell without reloading the
# data raises a KeyError because 'ID' has already been dropped.
df_test = df_test.drop('ID', axis=1)



In [4]:

    
# Baseline: a logistic regression with default hyper-parameters, scored by
# cross-validation on the full training set. With the "log_loss" scorer the
# reported values are negated losses (see the negative numbers in the cell
# output), so values closer to zero are better.
clf = LogisticRegression()
cross_validation.cross_val_score(estimator=clf, X=X, y=y, scoring="log_loss")









    Out[4]:





array([-1.02051202, -1.01842145, -1.02075702])



In [5]:

    
def report(grid_scores, n_top=3):
    """Print a short summary of the best-scoring grid-search candidates.

    Parameters
    ----------
    grid_scores : iterable
        Score records. Each record is indexable (position 1 is used as the
        sort key) and exposes the attributes ``mean_validation_score``,
        ``cv_validation_scores`` and ``parameters``.
    n_top : int, optional
        Maximum number of records to print (default 3).

    Returns
    -------
    None
        The summary is written to standard output.
    """
    # Highest value at position 1 first; the sort is stable for ties.
    ranked = sorted(grid_scores, key=itemgetter(1), reverse=True)
    for rank, entry in enumerate(ranked[:n_top], start=1):
        fold_spread = np.std(entry.cv_validation_scores)
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            entry.mean_validation_score, fold_spread))
        print("Parameters: {0}".format(entry.parameters))
        print("")



In [6]:

    
# Two-step estimator: cross-validated recursive feature elimination driven
# by a logistic regression, followed by the logistic regression that makes
# the final predictions on the selected features.
pipeline = Pipeline([
        ('featureSelection', RFECV(estimator=LogisticRegression(), scoring='log_loss')),
        ('clf', LogisticRegression())
    ])

# Candidate values for the `C` parameter of the final 'clf' step only; the
# feature-selection step keeps its default settings.
params = {
    'clf__C': [1, 3, 10, 30, 100, 300, 1000]
}

# The pipeline is intentionally NOT fitted on its own before the search:
# GridSearchCV works on clones of the estimator it is given, so a standalone
# `pipeline.fit(X, y)` would only repeat the costly feature elimination and
# then be thrown away.
grid_search = GridSearchCV(pipeline, params, n_jobs=-1, scoring='log_loss')
start = time()
grid_search.fit(X, y)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.grid_scores_)))
report(grid_search.grid_scores_)

# Class probabilities for the test set from the fitted search object.
predictions = grid_search.predict_proba(df_test)
# NOTE(review): the column names are hard-coded and assume that the
# probability columns come back in exactly this order -- confirm against
# the fitted classifier's `classes_` before submitting.
output = pd.DataFrame(predictions, columns=['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'])
# Name the index 'ID' and shift it from 0-based to 1-based row numbers.
output.index.names = ['ID']
output.index += 1
output.head()









    



GridSearchCV took 41.07 seconds for 7 candidate parameter settings.
Model with rank: 1
Mean validation score: -1.020 (std: 0.001)
Parameters: {'clf__C': 10}

Model with rank: 2
Mean validation score: -1.020 (std: 0.001)
Parameters: {'clf__C': 30}

Model with rank: 3
Mean validation score: -1.020 (std: 0.001)
Parameters: {'clf__C': 100}







    Out[6]:






  
    
      
      Adoption
      Died
      Euthanasia
      Return_to_owner
      Transfer
    
  
  
    
      1
      0.092272
      0.006857
      0.054737
      0.248775
      0.597359
    
    
      2
      0.446318
      0.002682
      0.055541
      0.334910
      0.160550
    
    
      3
      0.675391
      0.004771
      0.030689
      0.048345
      0.240804
    
    
      4
      0.243726
      0.005215
      0.062152
      0.254290
      0.434617
    
    
      5
      0.586253
      0.001345
      0.019162
      0.309073
      0.084166



In [6]:

    
# Persist the class-probability table as the submission file; the index is
# written as the first column under the header 'ID'.
output.to_csv(
    '../submission-logisticRegression.3.0.csv',
    index_label='ID',
)

	Adoption	Died	Euthanasia	Return_to_owner	Transfer
1	0.092272	0.006857	0.054737	0.248775	0.597359
2	0.446318	0.002682	0.055541	0.334910	0.160550
3	0.675391	0.004771	0.030689	0.048345	0.240804
4	0.243726	0.005215	0.062152	0.254290	0.434617
5	0.586253	0.001345	0.019162	0.309073	0.084166