In [246]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pylab as pl
import pandas as pd
import numpy as np
import sklearn

In [247]:
trainCSV = '/devSOLR-Crawler/img/gkt/kaggle/data_science_london_scikit/train.csv'
testCSV = '/devSOLR-Crawler/img/gkt/kaggle/data_science_london_scikit/test.csv'
trainLabel = '/devSOLR-Crawler/img/gkt/kaggle/data_science_london_scikit/trainLabels.csv'

In [248]:
train_df = pd.read_csv(trainCSV, header=None)
test_df = pd.read_csv(testCSV, header=None)
trainClass_df = pd.read_csv(trainLabel, header=None)

In [249]:
type(trainClass_df), type(train_df), type(test_df)


Out[249]:
(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame)

In [250]:
train_df.shape, test_df.shape, trainClass_df.shape


Out[250]:
((1000, 40), (9000, 40), (1000, 1))

In [151]:
train_df.ix[2][:10]


Out[151]:
X.1     1.192222
X.2    -0.414371
X.3     0.067054
X.4    -2.233568
X.5     3.658881
X.6     0.089007
X.7     0.203439
X.8    -4.219054
X.9    -1.184919
X.10   -1.240310
Name: 2

In [48]:
trainClass_df.ix[:10]


Out[48]:
    X.1
0     1
1     0
2     0
3     1
4     0
5     1
6     0
7     1
8     1
9     0
10    1

In [57]:
obj = pd.Series(np.random.permutation(xrange(10)))
obj


Out[57]:
0    5
1    8
2    1
3    0
4    6
5    2
6    7
7    9
8    3
9    4

In [63]:
obj.values


Out[63]:
array([5, 8, 1, 0, 6, 2, 7, 9, 3, 4])

In [68]:
obj[obj > 2]


Out[68]:
0    5
1    8
4    6
6    7
7    9
8    3
9    4

In [69]:
np.exp(obj)


Out[69]:
0     148.413159
1    2980.957987
2       2.718282
3       1.000000
4     403.428793
5       7.389056
6    1096.633158
7    8103.083928
8      20.085537
9      54.598150

In [70]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = pd.DataFrame(data)

In [77]:
frame.ix[2:3]


Out[77]:
   pop   state  year
2  3.6    Ohio  2002
3  2.4  Nevada  2001

In [251]:
X = train_df.as_matrix()
test = test_df.as_matrix()
y = trainClass_df.as_matrix()
#clf_rf = RandomForestClassifier(n_estimators=20)
#clf_rf.fit(train_df.ix, trainClass_df.ix)

In [252]:
X.shape, y.shape, test.shape


Out[252]:
((1000, 40), (1000, 1), (9000, 40))

In [91]:
np.asarray(y[:5])


Out[91]:
array([[1],
       [0],
       [0],
       [1],
       [0]])

In [154]:
clf_rf = RandomForestClassifier(n_estimators=20)
clf_rf.fit(X,y.ravel())


Out[154]:
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
            verbose=0)

In [155]:
cross_val_score(clf_rf, X,y.ravel(),cv=5)


Out[155]:
array([ 0.78 ,  0.89 ,  0.885,  0.855,  0.855])
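
classification_report and confusion_matrix are imported at the top but never used; the cell below is a small sketch (not part of the original run) of applying them to the random forest on a held-out split.

In [ ]:
# Sketch: evaluate clf_rf on a held-out split so the imported
# classification_report / confusion_matrix get exercised.
from sklearn.cross_validation import train_test_split

X_tr, X_val, y_tr, y_val = train_test_split(X, y.ravel(), test_size=0.2, random_state=0)
clf_eval = RandomForestClassifier(n_estimators=20).fit(X_tr, y_tr)
y_val_pred = clf_eval.predict(X_val)
print confusion_matrix(y_val, y_val_pred)
print classification_report(y_val, y_val_pred)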

In [96]:
y.ravel()[:5]


Out[96]:
array([1, 0, 0, 1, 0])

In [156]:
clf_logreg = LogisticRegression(penalty='l2', C=1.0, random_state=44)
clf_logreg.fit(X, y.ravel())
cross_val_score(clf_logreg, X, y.ravel(), cv=5)


Out[156]:
array([ 0.765,  0.85 ,  0.805,  0.845,  0.82 ])

In [169]:
from sklearn.decomposition import RandomizedPCA, PCA, NMF
from sklearn.svm import SVC
decompt = [RandomizedPCA(), PCA()]
for reduction in decompt:
    X_red = reduction.fit(X, y.ravel()).transform(X)
    #clf_red = RandomForestClassifier(n_estimators=500)
    #clf_red = LogisticRegression(penalty='l2')
    clf_red = SVC(C=10, gamma=.01)
    
    clf_red.fit(X_red, y.ravel())
    print cross_val_score(clf_red, X_red, y.ravel(), cv=5)


[ 0.875  0.93   0.93   0.935  0.915]
[ 0.875  0.93   0.93   0.935  0.915]

In [158]:
X_pca = RandomizedPCA(n_components=2).fit_transform(X)

In [159]:
X_pca.shape


Out[159]:
(1000, 2)
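
To sanity-check how much of the variance the two retained components carry, the explained variance ratio can be inspected; the cell below is a sketch, not part of the original run.

In [ ]:
# Sketch: variance explained by the first two principal components.
pca_check = PCA().fit(X)
print pca_check.explained_variance_ratio_[:2]
print pca_check.explained_variance_ratio_[:2].sum()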

In [160]:
from itertools import cycle
color = ['r', 'g']
for i,c in zip(np.unique(y.ravel()), cycle(color)):
    pl.scatter(X_pca[y.ravel()==i,0], X_pca[y.ravel()==i,1], label=str(i), c=c, alpha=.7)
    pl.legend(loc='best')



In [161]:
from sklearn.preprocessing import MinMaxScaler
X_scaled = MinMaxScaler().fit_transform(X,y.ravel())

In [163]:
X_scaled[0]


Out[163]:
array([ 0.54768934,  0.3201633 ,  0.8000649 ,  0.34879341,  0.63882063,
        0.52887636,  0.6498604 ,  0.59414052,  0.4918225 ,  0.51175956,
        0.28236241,  0.66643806,  0.58070021,  0.7745685 ,  0.8069802 ,
        0.49930528,  0.57952714,  0.53844392,  0.63734005,  0.89939759,
        0.44212449,  0.50575269,  0.47782523,  0.6726211 ,  0.44432756,
        0.42520146,  0.56487789,  0.61511922,  0.42339089,  0.28077799,
        0.40628762,  0.3526166 ,  0.40048828,  0.51825714,  0.81316446,
        0.57586941,  0.72647689,  0.10885793,  0.38835347,  0.54099147])

In [168]:
X_scaled.max()


Out[168]:
1.0000000000000002

In [254]:
from sklearn.decomposition import RandomizedPCA, PCA, NMF
from sklearn.svm import SVC
decompt = [RandomizedPCA(), PCA()]
for reduction in decompt:
    # note: the reducer is fit on X_scaled but applied to the unscaled X here,
    # which was likely intended to be X_scaled
    X_red = reduction.fit(X_scaled, y.ravel()).transform(X)
    #clf_red = RandomForestClassifier(n_estimators=500)
    #clf_red = LogisticRegression(penalty='l2')
    clf_red = SVC(C=10, gamma=.01)
    
    clf_red.fit(X_red, y.ravel())
    scores = cross_val_score(clf_red, X_red, y.ravel(), cv=5)
    print scores
    print scores.mean()


[ 0.875  0.93   0.93   0.935  0.915]
0.917
[ 0.875  0.93   0.93   0.935  0.915]
0.917

In [178]:
X_red[0]


Out[178]:
array([ 1.15831635, -7.57808084, -3.35627998, -0.28157474,  0.53774788,
       -0.40833054, -0.37155672, -0.43163425, -0.26714508, -0.95677181,
       -0.68485453,  0.16389513,  0.04926728,  0.04623549,  3.04515294,
       -0.93337112, -1.21653471, -0.21893167, -0.79716191, -1.36219637,
        2.92459008,  1.49172575,  0.92718838, -0.04762693,  1.76013204,
        0.40627476, -0.56072495, -1.81950206,  1.40847085, -0.08780133,
       -2.19118625, -0.60520328,  2.3122273 , -3.01917332,  2.74249363,
       -1.98633736,  0.67228068,  0.55238447,  0.97608066,  1.25358421])

In [182]:
from sklearn.ensemble import ExtraTreesClassifier

# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=2000, n_jobs=10, random_state=0)
forest.fit(X, y.ravel())
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

num_features = 40
# Plot the feature importances of the forest
import pylab as pl
pl.figure()
pl.title("Feature importances")
pl.bar(range(num_features), importances[indices],
       color="r", yerr=std[indices], align="center")
pl.xticks(range(num_features), indices)
pl.xlim([-1, num_features])
pl.show()
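
One way to pick how many of the ranked features to keep is to look at the cumulative importance; the cell below is a sketch along those lines, reusing importances and indices from the cell above.

In [ ]:
# Sketch: cumulative share of total importance covered by the top-k features
# (tree importances sum to 1), as an aid for choosing a cut-off.
print np.cumsum(importances[indices])[:20]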



In [265]:
clf_rf = RandomForestClassifier(n_estimators=200, random_state=22, n_jobs=6)
clf_rf.fit(X,y.ravel())
# threshold chosen after inspecting all the feature importances
# and deciding how far down the ranking to keep features
X_rf = clf_rf.transform(X, threshold=0.013165079409344109)

In [266]:
X_rf.shape


Out[266]:
(1000, 18)

In [257]:
sorted(clf_rf.feature_importances_, reverse=True)


Out[257]:
[0.11779691766383177,
 0.10942790089603846,
 0.056896868253683619,
 0.052756395591547357,
 0.048772394313168273,
 0.039569878234191017,
 0.037954002987930571,
 0.037950769164491353,
 0.035910014589998478,
 0.0358845435456739,
 0.035825873724102696,
 0.028504295413853079,
 0.023918387796585582,
 0.021890553938800811,
 0.020087092467018661,
 0.013895553203342594,
 0.013385027314425329,
 0.013165079409344109,
 0.012913660123578741,
 0.012884824700286972,
 0.012633388317349184,
 0.012598834600102663,
 0.012590324576193714,
 0.01249710973738179,
 0.012262423024405793,
 0.01224744686975999,
 0.012061207614709206,
 0.011880140782232023,
 0.011874643387291555,
 0.01187387710929271,
 0.011718671332569864,
 0.011167738977942068,
 0.011118088349293075,
 0.011078358456413879,
 0.010973554557016884,
 0.010970399453775124,
 0.0108878660415809,
 0.010785351905307428,
 0.0099746481771248105,
 0.0094158933983640239]
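
The transform(threshold=...) call above keeps every feature whose importance is at least the 18th-largest value in this list; the cell below is a sketch of the equivalent manual selection with a boolean mask.

In [ ]:
# Sketch: the same 18-feature selection done by hand.
mask = clf_rf.feature_importances_ >= 0.013165079409344109
print mask.sum()          # 18 features pass the threshold
X_rf_manual = X[:, mask]
print X_rf_manual.shape   # (1000, 18), matching X_rf above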

In [263]:
clf_red = SVC(C=10, gamma=.01, tol=.0011)   
clf_red.fit(X_rf, y.ravel())
scores = cross_val_score(clf_red, X_rf, y.ravel(), cv=5)
print scores
print scores.mean()


[ 0.905  0.945  0.955  0.93   0.91 ]
0.929

In [207]:
np.std(clf_rf.feature_importances_)


Out[207]:
0.024174820931845715

In [211]:
X_rf_test = clf_rf.transform(test,threshold=0.013165079409344109)
X_rf_test.shape


Out[211]:
(9000, 18)

In [212]:
y_predicted = clf_red.predict(X_rf_test)

In [235]:
print y_predicted[:5]
print clf_red


[1 0 1 0 0]
SVC(C=10, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.01,
  kernel=rbf, max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [216]:
submissionFile = '/home/ubuntu/stableDisk/img/gkt/kaggle/data_science_london_scikit/submission_2.csv'
np.savetxt(submissionFile, y_predicted, fmt='%d', newline='\r\n')

In [222]:
from sklearn.naive_bayes import GaussianNB
clf_nb = GaussianNB().fit(X_scaled,y.ravel())
scores = cross_val_score(clf_nb, X_scaled, y.ravel(), cv=5)
print scores
print scores.mean()


[ 0.735  0.85   0.84   0.85   0.835]
0.822

In [243]:
Cs = np.logspace(-1, 2, num=4)
gammas = np.logspace(-4, 0, num=5)
svc_params = {'C': Cs,
              'gamma': gammas}
from pprint import pprint
pprint(svc_params)


{'C': array([   0.1,    1. ,   10. ,  100. ]),
 'gamma': array([ 0.0001,  0.001 ,  0.01  ,  0.1   ,  1.    ])}

In [297]:
from sklearn.grid_search import GridSearchCV
from sklearn import svm

clf_svm_cv = GridSearchCV(svm.SVC(),svc_params, cv=3, n_jobs=-1)
clf_svm_cv.fit(X_rf,y.ravel())

print clf_svm_cv.best_params_
print clf_svm_cv.best_score_

scores = cross_val_score(clf_svm_cv, X_rf, y.ravel(), cv=10)
print scores
print scores.mean()


{'C': 100.0, 'gamma': 0.001}
0.914
[ 0.87  0.98  0.95  0.98  0.91  0.93  0.9   0.94  0.91  0.9 ]
0.927
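
GridSearchCV refits the best parameter combination on the full training data by default, so the fitted search object can predict directly; the cell below is a sketch of writing a submission from it (the output file name is made up here).

In [ ]:
# Sketch: predict the test set with the refitted best SVC from the grid search.
y_pred_cv = clf_svm_cv.predict(X_rf_test)
print y_pred_cv[:5]
# hypothetical output path -- adjust before use
#np.savetxt('submission_gridsearch.csv', y_pred_cv, fmt='%d', newline='\r\n')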

In [299]:
from sklearn.neighbors import KNeighborsClassifier
clf_knn = KNeighborsClassifier(n_neighbors=8, algorithm='auto', weights='distance', p=1, leaf_size=10)
print clf_knn
clf_knn.fit(X_rf, y.ravel())
scores = cross_val_score(clf_knn, X_rf, y.ravel(), cv=10)
print scores
print scores.mean()


KNeighborsClassifier(algorithm=auto, leaf_size=10, metric=minkowski,
           n_neighbors=8, p=1, weights=distance)
[ 0.88  0.94  0.97  0.94  0.91  0.9   0.92  0.93  0.92  0.91]
0.922
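
The neighbour count, weighting, and Minkowski power above were set by hand; as a possible follow-up (not part of the original run), the same GridSearchCV approach used for the SVC could tune them:

In [ ]:
# Sketch: grid-search the main KNN hyperparameters on the selected features.
knn_params = {'n_neighbors': range(3, 16),
              'weights': ['uniform', 'distance'],
              'p': [1, 2]}
clf_knn_cv = GridSearchCV(KNeighborsClassifier(), knn_params, cv=3, n_jobs=-1)
clf_knn_cv.fit(X_rf, y.ravel())
print clf_knn_cv.best_params_
print clf_knn_cv.best_score_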

In [281]:
KNeighborsClassifier?

In [ ]: