In [246]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pylab as pl
import pandas as pd
import numpy as np
import sklearn

In [247]:
trainCSV = '/devSOLR-Crawler/img/gkt/kaggle/data_science_london_scikit/train.csv'
testCSV = '/devSOLR-Crawler/img/gkt/kaggle/data_science_london_scikit/test.csv'
trainLabel = '/devSOLR-Crawler/img/gkt/kaggle/data_science_london_scikit/trainLabels.csv'

In [248]:
train_df = pd.read_csv(trainCSV, header=None)
test_df = pd.read_csv(testCSV, header=None)
trainClass_df = pd.read_csv(trainLabel, header=None)

In [249]:
type(trainClass_df), type(train_df), type(test_df)


Out[249]:
(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame)

In [250]:
train_df.shape, test_df.shape, trainClass_df.shape


Out[250]:
((1000, 40), (9000, 40), (1000, 1))

In [151]:
train_df.ix[2][:10]


Out[151]:
X.1     1.192222
X.2    -0.414371
X.3     0.067054
X.4    -2.233568
X.5     3.658881
X.6     0.089007
X.7     0.203439
X.8    -4.219054
X.9    -1.184919
X.10   -1.240310
Name: 2

In [48]:
trainClass_df.ix[:10]


Out[48]:
    X.1
0     1
1     0
2     0
3     1
4     0
5     1
6     0
7     1
8     1
9     0
10    1

In [57]:
obj = pd.Series(np.random.permutation(xrange(10)))
obj


Out[57]:
0    5
1    8
2    1
3    0
4    6
5    2
6    7
7    9
8    3
9    4

In [63]:
obj.values


Out[63]:
array([5, 8, 1, 0, 6, 2, 7, 9, 3, 4])

In [68]:
obj[obj > 2]


Out[68]:
0    5
1    8
4    6
6    7
7    9
8    3
9    4

In [69]:
np.exp(obj)


Out[69]:
0     148.413159
1    2980.957987
2       2.718282
3       1.000000
4     403.428793
5       7.389056
6    1096.633158
7    8103.083928
8      20.085537
9      54.598150

In [70]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = pd.DataFrame(data)

In [77]:
frame.ix[2:3]


Out[77]:
   pop   state  year
2  3.6    Ohio  2002
3  2.4  Nevada  2001

In [251]:
X = train_df.as_matrix()
test = test_df.as_matrix()
y = trainClass_df.as_matrix()
#clf_rf = RandomForestClassifier(n_estimators=20)
#clf_rf.fit(train_df.ix, trainClass_df.ix)

In [252]:
X.shape, y.shape, test.shape


Out[252]:
((1000, 40), (1000, 1), (9000, 40))

In [91]:
np.asarray(y[:5])


Out[91]:
array([[1],
       [0],
       [0],
       [1],
       [0]])

In [154]:
clf_rf = RandomForestClassifier(n_estimators=20)
clf_rf.fit(X,y.ravel())


Out[154]:
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
            verbose=0)

In [155]:
cross_val_score(clf_rf, X,y.ravel(),cv=5)


Out[155]:
array([ 0.78 ,  0.89 ,  0.885,  0.855,  0.855])
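
classification_report and confusion_matrix are imported at the top but never used; the cell below is a small sketch (not part of the original run) of applying them to the random forest on a held-out split.

In [ ]:
# Sketch: evaluate clf_rf on a held-out split so the imported
# classification_report / confusion_matrix get exercised.
from sklearn.cross_validation import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X, y.ravel(), test_size=0.2, random_state=0)
clf_eval = RandomForestClassifier(n_estimators=20).fit(X_tr, y_tr)
y_val_pred = clf_eval.predict(X_val)
print confusion_matrix(y_val, y_val_pred)
print classification_report(y_val, y_val_pred)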

In [96]:
y.ravel()[:5]


Out[96]:
array([1, 0, 0, 1, 0])

In [156]:
clf_logreg = LogisticRegression(penalty='l2', C=1.0, random_state=44)
clf_logreg.fit(X, y.ravel())
cross_val_score(clf_logreg, X, y.ravel(), cv=5)


Out[156]:
array([ 0.765,  0.85 ,  0.805,  0.845,  0.82 ])

In [169]:
from sklearn.decomposition import RandomizedPCA, PCA, NMF
from sklearn.svm import SVC
decompt = [RandomizedPCA(), PCA()]
for reduction in decompt:
    X_red = reduction.fit(X, y.ravel()).transform(X)
    #clf_red = RandomForestClassifier(n_estimators=500)
    #clf_red = LogisticRegression(penalty='l2')
    clf_red = SVC(C=10, gamma=.01)
    
    clf_red.fit(X_red, y.ravel())
    print cross_val_score(clf_red, X_red, y.ravel(), cv=5)


[ 0.875  0.93   0.93   0.935  0.915]
[ 0.875  0.93   0.93   0.935  0.915]

In [158]:
X_pca = RandomizedPCA(n_components=2).fit_transform(X)

In [159]:
X_pca.shape


Out[159]:
(1000, 2)
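
To sanity-check how much of the variance the two retained components carry, the explained variance ratio can be inspected; the cell below is a sketch, not part of the original run.

In [ ]:
# Sketch: variance explained by the first two principal components.
pca_check = PCA().fit(X)
print pca_check.explained_variance_ratio_[:2]
print pca_check.explained_variance_ratio_[:2].sum()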

In [160]:
from itertools import cycle
color = ['r', 'g']
for i,c in zip(np.unique(y.ravel()), cycle(color)):
    pl.scatter(X_pca[y.ravel()==i,0], X_pca[y.ravel()==i,1], label=str(i), c=c, alpha=.7)
    pl.legend(loc='best')



In [161]:
from sklearn.preprocessing import MinMaxScaler
X_scaled = MinMaxScaler().fit_transform(X,y.ravel())

In [163]:
X_scaled[0]


Out[163]:
array([ 0.54768934,  0.3201633 ,  0.8000649 ,  0.34879341,  0.63882063,
        0.52887636,  0.6498604 ,  0.59414052,  0.4918225 ,  0.51175956,
        0.28236241,  0.66643806,  0.58070021,  0.7745685 ,  0.8069802 ,
        0.49930528,  0.57952714,  0.53844392,  0.63734005,  0.89939759,
        0.44212449,  0.50575269,  0.47782523,  0.6726211 ,  0.44432756,
        0.42520146,  0.56487789,  0.61511922,  0.42339089,  0.28077799,
        0.40628762,  0.3526166 ,  0.40048828,  0.51825714,  0.81316446,
        0.57586941,  0.72647689,  0.10885793,  0.38835347,  0.54099147])

In [168]:
X_scaled.max()


Out[168]:
1.0000000000000002

In [254]:
from sklearn.decomposition import RandomizedPCA, PCA, NMF
from sklearn.svm import SVC
decompt = [RandomizedPCA(), PCA()]
for reduction in decompt:
    # note: the reducer is fit on X_scaled but applied to the unscaled X here,
    # which was likely intended to be X_scaled
    X_red = reduction.fit(X_scaled, y.ravel()).transform(X)
    #clf_red = RandomForestClassifier(n_estimators=500)
    #clf_red = LogisticRegression(penalty='l2')
    clf_red = SVC(C=10, gamma=.01)
    
    clf_red.fit(X_red, y.ravel())
    scores = cross_val_score(clf_red, X_red, y.ravel(), cv=5)
    print scores
    print scores.mean()


[ 0.875  0.93   0.93   0.935  0.915]
0.917
[ 0.875  0.93   0.93   0.935  0.915]
0.917

In [178]:
X_red[0]


Out[178]:
array([ 1.15831635, -7.57808084, -3.35627998, -0.28157474,  0.53774788,
       -0.40833054, -0.37155672, -0.43163425, -0.26714508, -0.95677181,
       -0.68485453,  0.16389513,  0.04926728,  0.04623549,  3.04515294,
       -0.93337112, -1.21653471, -0.21893167, -0.79716191, -1.36219637,
        2.92459008,  1.49172575,  0.92718838, -0.04762693,  1.76013204,
        0.40627476, -0.56072495, -1.81950206,  1.40847085, -0.08780133,
       -2.19118625, -0.60520328,  2.3122273 , -3.01917332,  2.74249363,
       -1.98633736,  0.67228068,  0.55238447,  0.97608066,  1.25358421])

In [182]:
from sklearn.ensemble import ExtraTreesClassifier

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=2000, n_jobs=10, random_state=0)
forest.fit(X, y.ravel())
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

num_features = 40
# Plot the feature importances of the forest
import pylab as pl
pl.figure()
pl.title("Feature importances")
pl.bar(range(num_features), importances[indices],
       color="r", yerr=std[indices], align="center")
pl.xticks(range(num_features), indices)
pl.xlim([-1, num_features])
pl.show()
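
One way to pick how many of the ranked features to keep is to look at the cumulative importance; the cell below is a sketch along those lines, reusing importances and indices from the cell above.

In [ ]:
# Sketch: cumulative share of total importance covered by the top-k features
# (tree importances sum to 1), as an aid for choosing a cut-off.
print np.cumsum(importances[indices])[:20]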



In [265]:
clf_rf = RandomForestClassifier(n_estimators=200, random_state=22, n_jobs=6)
clf_rf.fit(X,y.ravel())
# threshold chosen after inspecting all the feature importances
# and deciding how far down the ranking to keep features
X_rf = clf_rf.transform(X, threshold=0.013165079409344109)

In [266]:
X_rf.shape


Out[266]:
(1000, 18)

In [257]:
sorted(clf_rf.feature_importances_, reverse=True)


Out[257]:
[0.11779691766383177,
 0.10942790089603846,
 0.056896868253683619,
 0.052756395591547357,
 0.048772394313168273,
 0.039569878234191017,
 0.037954002987930571,
 0.037950769164491353,
 0.035910014589998478,
 0.0358845435456739,
 0.035825873724102696,
 0.028504295413853079,
 0.023918387796585582,
 0.021890553938800811,
 0.020087092467018661,
 0.013895553203342594,
 0.013385027314425329,
 0.013165079409344109,
 0.012913660123578741,
 0.012884824700286972,
 0.012633388317349184,
 0.012598834600102663,
 0.012590324576193714,
 0.01249710973738179,
 0.012262423024405793,
 0.01224744686975999,
 0.012061207614709206,
 0.011880140782232023,
 0.011874643387291555,
 0.01187387710929271,
 0.011718671332569864,
 0.011167738977942068,
 0.011118088349293075,
 0.011078358456413879,
 0.010973554557016884,
 0.010970399453775124,
 0.0108878660415809,
 0.010785351905307428,
 0.0099746481771248105,
 0.0094158933983640239]
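
The transform(threshold=...) call above keeps every feature whose importance is at least the 18th-largest value in this list; the cell below is a sketch of the equivalent manual selection with a boolean mask.

In [ ]:
# Sketch: the same 18-feature selection done by hand.
mask = clf_rf.feature_importances_ >= 0.013165079409344109
print mask.sum()          # 18 features pass the threshold
X_rf_manual = X[:, mask]
print X_rf_manual.shape   # (1000, 18), matching X_rf above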

In [263]:
clf_red = SVC(C=10, gamma=.01, tol=.0011)   
clf_red.fit(X_rf, y.ravel())
scores = cross_val_score(clf_red, X_rf, y.ravel(), cv=5)
print scores
print scores.mean()


[ 0.905  0.945  0.955  0.93   0.91 ]
0.929

In [207]:
np.std(clf_rf.feature_importances_)


Out[207]:
0.024174820931845715

In [211]:
X_rf_test = clf_rf.transform(test,threshold=0.013165079409344109)
X_rf_test.shape


Out[211]:
(9000, 18)

In [212]:
y_predicted = clf_red.predict(X_rf_test)

In [235]:
print y_predicted[:5]
print clf_red


[1 0 1 0 0]
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.01,
  kernel=rbf, max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [216]:
submissionFile = '/home/ubuntu/stableDisk/img/gkt/kaggle/data_science_london_scikit/submission_2.csv'
np.savetxt(submissionFile, y_predicted, fmt='%d', newline='\r\n')

In [222]:
from sklearn.naive_bayes import GaussianNB
clf_nb = GaussianNB().fit(X_scaled,y.ravel())
scores = cross_val_score(clf_nb, X_scaled, y.ravel(), cv=5)
print scores
print scores.mean()


[ 0.735  0.85   0.84   0.85   0.835]
0.822

In [243]:
Cs = np.logspace(-1, 2, num=4)
gammas = np.logspace(-4, 0, num=5)
svc_params = {'C': Cs,
              'gamma': gammas}
from pprint import pprint
pprint(svc_params)


{'C': array([   0.1,    1. ,   10. ,  100. ]),
 'gamma': array([ 0.0001,  0.001 ,  0.01  ,  0.1   ,  1.    ])}

In [297]:
from sklearn.grid_search import GridSearchCV
from sklearn import svm

clf_svm_cv = GridSearchCV(svm.SVC(),svc_params, cv=3, n_jobs=-1)
clf_svm_cv.fit(X_rf,y.ravel())

print clf_svm_cv.best_params_
print clf_svm_cv.best_score_

scores = cross_val_score(clf_svm_cv, X_rf, y.ravel(), cv=10)
print scores
print scores.mean()


{'C': 100.0, 'gamma': 0.001}
0.914
[ 0.87  0.98  0.95  0.98  0.91  0.93  0.9   0.94  0.91  0.9 ]
0.927
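
GridSearchCV refits the best parameter combination on the full training data by default, so the fitted search object can predict directly; the cell below is a sketch of writing a submission from it (the output file name is made up here).

In [ ]:
# Sketch: predict the test set with the refitted best SVC from the grid search.
y_pred_cv = clf_svm_cv.predict(X_rf_test)
print y_pred_cv[:5]
# hypothetical output path -- adjust before use
#np.savetxt('submission_gridsearch.csv', y_pred_cv, fmt='%d', newline='\r\n')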

In [299]:
from sklearn.neighbors import KNeighborsClassifier
clf_knn = KNeighborsClassifier(n_neighbors=8, algorithm='auto', weights='distance', p=1, leaf_size=10)
print clf_knn
clf_knn.fit(X_rf, y.ravel())
scores = cross_val_score(clf_knn, X_rf, y.ravel(), cv=10)
print scores
print scores.mean()


KNeighborsClassifier(algorithm=auto, leaf_size=10, metric=minkowski,
           n_neighbors=8, p=1, weights=distance)
[ 0.88  0.94  0.97  0.94  0.91  0.9   0.92  0.93  0.92  0.91]
0.922
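
The neighbour count, weighting, and Minkowski power above were set by hand; as a possible follow-up (not part of the original run), the same GridSearchCV approach used for the SVC could tune them:

In [ ]:
# Sketch: grid-search the main KNN hyperparameters on the selected features.
knn_params = {'n_neighbors': range(3, 16),
              'weights': ['uniform', 'distance'],
              'p': [1, 2]}
clf_knn_cv = GridSearchCV(KNeighborsClassifier(), knn_params, cv=3, n_jobs=-1)
clf_knn_cv.fit(X_rf, y.ravel())
print clf_knn_cv.best_params_
print clf_knn_cv.best_score_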

In [281]:
KNeighborsClassifier?

In [ ]: