In [1]:
    
import pandas as pd
import matplotlib.pyplot as plt
    
In [2]:
    
# Load the training table from a local text file (read_csv defaults: comma-separated).
# NOTE(review): read_csv treats the first line as column names by default — confirm the file has a header row.
train_df = pd.read_csv("training_data.txt")
    
In [3]:
    
# Load the labels that accompany the training table.
# NOTE(review): read_csv treats the first line as column names by default — confirm the file has a header row.
train_label_df = pd.read_csv("training_labels.txt")
    
In [4]:
    
# Load the testing table from a local text file (read_csv defaults: comma-separated).
# NOTE(review): read_csv treats the first line as column names by default — confirm the file has a header row.
test_df = pd.read_csv("testing_data.txt")
    
In [5]:
    
# Load the labels that accompany the testing table.
# NOTE(review): read_csv treats the first line as column names by default — confirm the file has a header row.
test_label_df = pd.read_csv("testing_labels.txt")
    
In [6]:
    
# Training feature matrix: every row, first 22 columns, as a NumPy array.
X_train = train_df.iloc[:,:22].values
    
In [7]:
    
# Training labels as a flat 1-D vector (the shape scikit-learn expects for y).
# ravel() infers the length from the data, so this cell no longer breaks when
# the label file does not contain exactly 155 rows.
y_train = train_label_df.values.ravel()
    
In [8]:
    
# Test feature matrix: every row, first 22 columns, as a NumPy array.
X_test = test_df.iloc[:,:22].values
    
In [9]:
    
# Test labels as a flat 1-D vector (the shape scikit-learn expects for y).
# ravel() infers the length from the data, so this cell no longer breaks when
# the label file does not contain exactly 40 rows.
y_test = test_label_df.values.ravel()
    
In [10]:
    
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
    
In [11]:
    
# Sweep the PCA dimensionality over 2..21 components. For each setting, fit a
# standardize -> PCA -> logistic-regression pipeline on the training split and
# record its accuracy on the test split.
#   graph_X: component counts tried
#   graph_Y: matching test accuracies
# NOTE(review): accuracy here is measured on X_test/y_test, so picking a
# component count from this curve uses the test split for model selection.
graph_X = list(range(2, 22))
graph_Y = []
for n_components in graph_X:
    logistic_pipe = Pipeline([
        ('scl', StandardScaler()),
        ('pca', PCA(n_components=n_components)),
        ('clf', LogisticRegression(random_state=1)),
    ])
    logistic_pipe.fit(X_train, y_train)
    graph_Y.append(logistic_pipe.score(X_test, y_test))
    
In [12]:
    
# Line plot of test accuracy against the number of PCA components, with one
# x tick per component count tried. Uses the explicit Axes interface; the
# calls are issued in the same order as their pyplot equivalents.
fig, ax = plt.subplots()
ax.set_xticks(graph_X)
ax.set_xlabel("Number of components")
ax.set_ylabel("Accuracy")
ax.plot(graph_X, graph_Y)
plt.show()
    
    
In [13]:
    
import numpy as np
    
In [17]:
    
from sklearn.cross_validation import StratifiedKFold
    
In [18]:
    
kfold = StratifiedKFold(y=y_train.reshape(155),n_folds=5,random_state=1)
    
In [19]:
    
# Accumulator for the per-fold accuracies appended by the cross-validation loop.
# NOTE: re-run this cell before re-running that loop, otherwise old scores pile up.
scores = list()
    
In [20]:
    
# Pipeline evaluated with k-fold CV: standardize the features, project onto
# 5 principal components, then classify with logistic regression.
logistic_pipe_kfold = Pipeline([
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=5)),
    ('clf', LogisticRegression(random_state=1)),
])
    
In [21]:
    
# Manual cross-validation: for every (train, validation) index pair in kfold,
# refit the pipeline on the training part and append its accuracy on the
# held-out part to `scores`.
for fold_idx, (train_idx, val_idx) in enumerate(kfold):
    logistic_pipe_kfold.fit(X_train[train_idx], y_train[train_idx])
    scores.append(logistic_pipe_kfold.score(X_train[val_idx], y_train[val_idx]))
    
In [22]:
    
# Print one line per fold: 1-based fold number and its accuracy to 3 decimals.
for fold_no, fold_score in enumerate(scores, start=1):
    print("%s: %.3f" % (fold_no, fold_score))
    
    
In [23]:
    
# Mean of the per-fold accuracies collected in `scores`.
np.mean(scores)
    
    Out[23]:
In [24]:
    
# Spread (standard deviation) of the per-fold accuracies collected in `scores`.
np.std(scores)
    
    Out[24]:
In [25]:
    
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was deprecated in scikit-learn 0.18 and removed in 0.20.
from sklearn.model_selection import GridSearchCV
    
    
In [26]:
    
from sklearn.svm import SVC
    
In [27]:
    
# SVM pipeline: standardize the features, then fit a support-vector classifier.
# The step names ('scl', 'clf') are what the grid-search parameter keys refer to.
pipe_svc = Pipeline([
    ('scl', StandardScaler()),
    ('clf', SVC(random_state=1)),
])
    
In [28]:
    
# Candidate values on a log scale, one per decade from 1e-4 to 1e3.
param_range = [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
    
In [29]:
    
# Two sub-grids for the SVM search (keys address the 'clf' pipeline step):
#   * linear kernel: tune C only
#   * RBF kernel:    tune C and gamma jointly
param_grid = [
    {
        'clf__C': param_range,
        'clf__kernel': ['linear'],
    },
    {
        'clf__C': param_range,
        'clf__gamma': param_range,
        'clf__kernel': ['rbf'],
    },
]
    
In [30]:
    
# Exhaustive search over param_grid for the SVM pipeline, ranking candidates
# by 5-fold cross-validated accuracy; n_jobs=-1 uses all available cores.
gs = GridSearchCV(
    estimator=pipe_svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
    
In [31]:
    
# Run the SVM grid search on the training data and keep the fitted search object.
svm_gs = gs.fit(X_train,y_train)
    
In [32]:
    
# Best cross-validated accuracy found by the SVM grid search.
svm_gs.best_score_
    
    Out[32]:
In [33]:
    
# Parameter combination that produced the best cross-validated score.
svm_gs.best_params_
    
    Out[33]:
In [34]:
    
# Score of the tuned SVM on the test split.
svm_gs.score(X_test,y_test)
    
    Out[34]:
In [35]:
    
from sklearn.metrics import confusion_matrix
    
In [36]:
    
# Predicted test-set labels from the tuned SVM.
y_pred = svm_gs.predict(X_test)
    
In [37]:
    
# Confusion matrix of true vs. SVM-predicted test labels.
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    
In [38]:
    
import seaborn as sns
    
In [39]:
    
# Draw the SVM confusion matrix as a heatmap with the counts written in each cell.
ax = sns.heatmap(confmat,annot=True)
    
In [40]:
    
# Render the heatmap drawn in the previous cell.
plt.show()
    
    
In [87]:
    
from sklearn.tree import DecisionTreeClassifier
    
In [89]:
    
# Grid search over the decision tree's max_depth (1..7, plus None for an
# unrestricted tree), ranked by 5-fold cross-validated accuracy.
dtree_gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=[{'max_depth': list(range(1, 8)) + [None]}],
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
    
In [90]:
    
# cross_val_score lives in sklearn.model_selection; the old
# sklearn.cross_validation module was deprecated in scikit-learn 0.18 and
# removed in 0.20.
from sklearn.model_selection import cross_val_score
    
In [91]:
    
# Nested cross-validation: an outer 5-fold loop scores the grid search, which
# itself tunes max_depth with its own inner 5-fold CV on each outer training part.
# NOTE: rebinds `scores`, which earlier held the logistic-regression fold scores.
scores = cross_val_score(dtree_gs,X_train,y_train,scoring='accuracy',cv=5)
    
In [93]:
    
# Show the accuracy obtained on each outer fold.
print(scores)
    
    
In [94]:
    
# Mean and standard deviation of the outer-fold accuracies.
np.mean(scores), np.std(scores)
    
    Out[94]:
In [96]:
    
# Fit the decision-tree grid search on the whole training set.
dtree_gs.fit(X_train,y_train)
    
    Out[96]:
In [97]:
    
# max_depth setting that produced the best cross-validated score.
dtree_gs.best_params_
    
    Out[97]:
In [98]:
    
# Score of the tuned decision tree on the test split.
dtree_gs.score(X_test,y_test)
    
    Out[98]:
In [99]:
    
from sklearn.metrics import confusion_matrix
    
In [100]:
    
# Predicted test-set labels from the tuned decision tree (overwrites the earlier y_pred).
y_pred = dtree_gs.predict(X_test)
    
In [101]:
    
# Confusion matrix of true vs. decision-tree-predicted test labels.
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    
In [102]:
    
# Display the raw confusion-matrix counts.
confmat
    
    Out[102]:
In [103]:
    
import seaborn as sns
    
In [104]:
    
# Switch matplotlib to seaborn's default plot styling for subsequent figures.
sns.set()
    
In [108]:
    
# Draw the decision-tree confusion matrix as a heatmap with the counts written in each cell.
ax = sns.heatmap(confmat, annot=True)
    
In [110]:
    
# Render the heatmap drawn in the previous cell.
plt.show()
    
    
In [41]:
    
from sklearn.neighbors import KNeighborsClassifier
    
In [49]:
    
# k-NN pipeline: standardize the features, then classify by nearest neighbours.
# The step names ('scl', 'clf') are what the grid-search parameter keys refer to.
pipe_knn = Pipeline([
    ('scl', StandardScaler()),
    ('clf', KNeighborsClassifier()),
])
    
In [50]:
    
# k-NN search space: neighbourhood sizes 3..10 and Minkowski power p of 1 or 2.
# NOTE: rebinds `param_grid`, replacing the SVM grid defined earlier.
param_grid = [
    {
        'clf__n_neighbors': list(range(3, 11)),
        'clf__p': [1, 2],
    },
]
    
In [51]:
    
# Exhaustive search over param_grid for the k-NN pipeline, ranking candidates
# by 5-fold cross-validated accuracy; n_jobs=-1 uses all available cores.
gs_knn = GridSearchCV(
    estimator=pipe_knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
    
In [52]:
    
# Run the k-NN grid search on the training data.
gs_knn.fit(X_train,y_train)
    
    Out[52]:
In [53]:
    
# Parameter combination that produced the best cross-validated score.
gs_knn.best_params_
    
    Out[53]:
In [54]:
    
# Best cross-validated accuracy found by the k-NN grid search.
gs_knn.best_score_
    
    Out[54]:
In [55]:
    
# Predicted test-set labels from the tuned k-NN model (overwrites the earlier y_pred).
y_pred = gs_knn.predict(X_test)
    
In [56]:
    
# Confusion matrix of true vs. k-NN-predicted test labels.
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    
In [57]:
    
# Draw the k-NN confusion matrix as a heatmap with the counts written in each cell.
ax = sns.heatmap(confmat, annot=True)
    
In [58]:
    
# Render the heatmap drawn in the previous cell.
plt.show()
    
    
In [59]:
    
# Score of the tuned k-NN model on the test split.
gs_knn.score(X_test,y_test)
    
    Out[59]:
In [60]:
    
# Sequential is the linear-stack model class used to build the network below.
# (The previous spelling "Sequentialuential" does not exist in keras.models and
# raised ImportError, leaving `Sequential` undefined on a fresh kernel.)
from keras.models import Sequential
    
In [61]:
    
from keras.layers import Dense
    
In [63]:
    
# Small fully connected binary classifier:
#   22 input features -> 12 ReLU units -> 8 ReLU units -> 1 sigmoid output.
model = Sequential([
    Dense(12, input_dim=22, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])
    
In [64]:
    
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy tracked as a metric.
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    
In [65]:
    
# Print an overview of the network's layers.
model.summary()
    
    
In [66]:
    
# Train the network on the training split for 100 epochs in mini-batches of 10.
# NOTE(review): X_train is passed in unscaled here, whereas the scikit-learn
# pipelines standardize it first — confirm this is intended. No random seed is
# set for Keras either, so results may vary between runs.
model.fit(X_train,y_train, epochs=100, batch_size=10)
    
    
    Out[66]:
In [68]:
    
# Evaluate the trained network on the test split.
# NOTE: rebinds `scores` to evaluate()'s return value — presumably [loss, accuracy]
# given the metrics passed to compile(); confirm against model.metrics_names.
scores = model.evaluate(X_test,y_test)
    
    
In [69]:
    
# Second entry of the evaluate() result — presumably the test accuracy (the first being the loss).
scores[1]
    
    Out[69]:
In [71]:
    
# Raw network outputs for the test set; the output layer is a sigmoid, so these are values in [0, 1], not class labels.
y_pred = model.predict(X_test)
    
In [78]:
    
# Turn the sigmoid outputs into hard 0/1 class labels by thresholding at 0.5
# (an output of exactly 0.5 maps to class 0, as it did with round()).
# Vectorized and idempotent: re-running this cell on the already-thresholded
# labels leaves them unchanged, whereas the per-row round(y[0]) form failed on
# a second run because y_pred no longer held rows.
y_pred = (np.asarray(y_pred) > 0.5).astype(int).ravel()
    
In [80]:
    
# Confusion matrix of true vs. network-predicted test labels.
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
    
In [81]:
    
# Draw the network's confusion matrix as a heatmap with the counts written in each cell.
ax = sns.heatmap(confmat, annot=True)
    
In [82]:
    
# Render the heatmap drawn in the previous cell.
plt.show()
    
    
In [ ]: