In [246]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pylab as pl
import pandas as pd
import numpy as np
import sklearn
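A note on the imports: sklearn.cross_validation is the module name in this scikit-learn release. From 0.18 onward the same function lives in sklearn.model_selection, and the old module was removed entirely in 0.20, so on a recent install the import becomes:

from sklearn.model_selection import cross_val_score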
In [247]:
trainCSV = '/devSOLR-Crawler/img/gkt/kaggle/data_science_london_scikit/train.csv'
testCSV = '/devSOLR-Crawler/img/gkt/kaggle/data_science_london_scikit/test.csv'
trainLabel = '/devSOLR-Crawler/img/gkt/kaggle/data_science_london_scikit/trainLabels.csv'
In [248]:
train_df = pd.read_csv(trainCSV, header=None)
test_df = pd.read_csv(testCSV, header=None)
trainClass_df = pd.read_csv(trainLabel, header=None)
In [249]:
type(trainClass_df), type(train_df), type(test_df)
Out[249]:
In [250]:
train_df.shape, test_df.shape, trainClass_df.shape
Out[250]:
In [151]:
train_df.ix[2][:10]
Out[151]:
In [48]:
trainClass_df.ix[:10]
Out[48]:
In [57]:
obj = pd.Series(np.random.permutation(xrange(10)))
obj
Out[57]:
In [63]:
obj.values
Out[63]:
In [68]:
obj[obj > 2]
Out[68]:
In [69]:
np.exp(obj)
Out[69]:
In [70]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = pd.DataFrame(data)
In [77]:
frame.ix[2:3]
Out[77]:
In [251]:
X = train_df.as_matrix()
test = test_df.as_matrix()
y = trainClass_df.as_matrix()
#clf_rf = RandomForestClassifier(n_estimators=20)
#clf_rf.fit(train_df.ix, trainClass_df.ix)
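as_matrix() pulls the underlying NumPy arrays out of the DataFrames. The labels come back as an (n_samples, 1) column, which is why y.ravel() shows up in every fit below: scikit-learn wants a flat 1-D label vector. A minimal equivalent using the .values attribute, flattening the labels up front:

X = train_df.values
test = test_df.values
y = trainClass_df.values.ravel()  # (n, 1) column -> (n,) vector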
In [252]:
X.shape, y.shape, test.shape
Out[252]:
In [91]:
np.asarray(y[:5])
Out[91]:
In [154]:
clf_rf = RandomForestClassifier(n_estimators=20)
clf_rf.fit(X, y.ravel())
Out[154]:
In [155]:
cross_val_score(clf_rf, X, y.ravel(), cv=5)
Out[155]:
In [96]:
y.ravel()[:5]
Out[96]:
In [156]:
clf_logreg = LogisticRegression(penalty='l2', C=1.0, random_state=44)
clf_logreg.fit(X, y.ravel())
cross_val_score(clf_logreg, X, y.ravel(), cv=5)
Out[156]:
In [169]:
from sklearn.decomposition import RandomizedPCA, PCA, NMF
from sklearn.svm import SVC
decompt = [RandomizedPCA(), PCA()]
for reduction in decompt:
    # reduce the features, then score an RBF SVC on the reduced matrix
    X_red = reduction.fit(X, y.ravel()).transform(X)
    #clf_red = RandomForestClassifier(n_estimators=500)
    #clf_red = LogisticRegression(penalty='l2')
    clf_red = SVC(C=10, gamma=.01)
    clf_red.fit(X_red, y.ravel())
    print cross_val_score(clf_red, X_red, y.ravel(), cv=5)
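One caveat with the loop above: the reducer is fit on all of X before cross_val_score splits it, so each held-out fold has already influenced the projection. Chaining the steps in a Pipeline refits the reducer inside every fold instead; a minimal sketch with the same components:

from sklearn.pipeline import Pipeline

pipe = Pipeline([('reduce', PCA()), ('svc', SVC(C=10, gamma=.01))])
print cross_val_score(pipe, X, y.ravel(), cv=5)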
In [158]:
X_pca = RandomizedPCA(n_components=2).fit_transform(X)
In [159]:
X_pca.shape
Out[159]:
In [160]:
from itertools import cycle
color = ['r','g']
for i, c in zip(np.unique(y.ravel()), cycle(color)):
    pl.scatter(X_pca[y.ravel() == i, 0], X_pca[y.ravel() == i, 1], label=str(i), c=c, alpha=.7)
pl.legend(loc='best')
In [161]:
from sklearn.preprocessing import MinMaxScaler
X_scaled = MinMaxScaler().fit_transform(X)  # scaling is unsupervised; y is not needed
In [163]:
X_scaled[0]
Out[163]:
In [168]:
X_scaled.max()
Out[168]:
In [254]:
from sklearn.decomposition import RandomizedPCA, PCA, NMF
from sklearn.svm import SVC
decompt = [RandomizedPCA(), PCA()]
for reduction in decompt:
    # fit and transform the same scaled matrix
    X_red = reduction.fit(X_scaled, y.ravel()).transform(X_scaled)
    #clf_red = RandomForestClassifier(n_estimators=500)
    #clf_red = LogisticRegression(penalty='l2')
    clf_red = SVC(C=10, gamma=.01)
    clf_red.fit(X_red, y.ravel())
    scores = cross_val_score(clf_red, X_red, y.ravel(), cv=5)
    print scores
    print scores.mean()
In [178]:
X_red[0]
Out[178]:
In [182]:
from sklearn.ensemble import ExtraTreesClassifier
# Build a forest and compute the feature importances
forest = ExtraTreesClassifier(n_estimators=2000, n_jobs=10,
                              random_state=0)
forest.fit(X, y.ravel())
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
num_features = 40
# Plot the feature importances of the forest
import pylab as pl
pl.figure()
pl.title("Feature importances")
pl.bar(range(num_features), importances[indices],
       color="r", yerr=std[indices], align="center")
pl.xticks(range(num_features), indices)
pl.xlim([-1, num_features])
pl.show()
In [265]:
clf_rf = RandomForestClassifier(n_estimators=200, random_state=22, n_jobs=6)
clf_rf.fit(X, y.ravel())
# threshold chosen after inspecting the sorted feature importances
# and deciding how many of the top features to keep
X_rf = clf_rf.transform(X, threshold=0.013165079409344109)
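The transform(X, threshold=...) call above keeps only the columns whose importance exceeds the threshold. That estimator-level transform was deprecated in later scikit-learn releases; from 0.17 on, the same selection is spelled with SelectFromModel. A hedged equivalent using the forest already fit above:

from sklearn.feature_selection import SelectFromModel

selector = SelectFromModel(clf_rf, threshold=0.013165079409344109, prefit=True)
X_rf = selector.transform(X)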
In [266]:
X_rf.shape
Out[266]:
In [257]:
sorted(clf_rf.feature_importances_, reverse=True)
Out[257]:
In [263]:
clf_red = SVC(C=10, gamma=.01, tol=.0011)
clf_red.fit(X_rf, y.ravel())
scores = cross_val_score(clf_red, X_rf, y.ravel(), cv=5)
print scores
print scores.mean()
In [207]:
np.std(clf_rf.feature_importances_)
Out[207]:
In [211]:
X_rf_test = clf_rf.transform(test, threshold=0.013165079409344109)
X_rf_test.shape
Out[211]:
In [212]:
y_predicted = clf_red.predict(X_rf_test)
In [235]:
print y_predicted[:5]
print clf_red
In [216]:
submissionFile = '/home/ubuntu/stableDisk/img/gkt/kaggle/data_science_london_scikit/submission_2.csv'
np.savetxt(submissionFile, y_predicted, fmt='%d', newline='\r\n')
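np.savetxt writes one bare label per line. Kaggle playgrounds usually want a header row and an Id column; the column names below ('Id', 'Solution') are an assumption based on the usual playground format, so check them against this competition's sample submission before uploading:

submission = pd.DataFrame({'Id': np.arange(1, len(y_predicted) + 1),
                           'Solution': y_predicted})
submission.to_csv(submissionFile, index=False, columns=['Id', 'Solution'])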
In [222]:
from sklearn.naive_bayes import GaussianNB
clf_nb = GaussianNB().fit(X_scaled, y.ravel())
scores = cross_val_score(clf_nb, X_scaled, y.ravel(), cv=5)
print scores
print scores.mean()
In [243]:
Cs = np.logspace(-1, 2, num=4)
gammas = np.logspace(-4, 0, num=5)
svc_params = {
    'C': Cs,
    'gamma': gammas
}
from pprint import pprint
pprint(svc_params)
In [297]:
from sklearn.grid_search import GridSearchCV
from sklearn import svm
clf_svm_cv = GridSearchCV(svm.SVC(), svc_params, cv=3, n_jobs=-1)
clf_svm_cv.fit(X_rf, y.ravel())
print clf_svm_cv.best_params_
print clf_svm_cv.best_score_
scores = cross_val_score(clf_svm_cv, X_rf, y.ravel(), cv=10)
print scores
print scores.mean()
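Passing the GridSearchCV object itself to cross_val_score is nested cross-validation: the grid search reruns inside each outer fold, so the outer mean estimates the whole tuning procedure rather than one lucky parameter pair (best_score_ is a maximum over the grid and tends to be optimistic). To turn the winning settings into a final model, best_params_ unpacks straight into a fresh SVC; a short sketch reusing the selected features:

clf_best = svm.SVC(**clf_svm_cv.best_params_)
clf_best.fit(X_rf, y.ravel())
y_predicted = clf_best.predict(X_rf_test)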
In [299]:
from sklearn.neighbors import KNeighborsClassifier
clf_knn = KNeighborsClassifier(n_neighbors=8, algorithm='auto', weights='distance', p=1, leaf_size=10)
print clf_knn
clf_knn.fit(X_rf, y.ravel())
scores = cross_val_score(clf_knn, X_rf, y.ravel(), cv=10)
print scores
print scores.mean()
In [281]:
KNeighborsClassifier?