In [1]:
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
# Load the training/testing feature tables and their label files.
# NOTE(review): read_csv assumes the .txt files are comma-separated with a
# header row — confirm against the raw files.
train_df = pd.read_csv("training_data.txt")
In [3]:
# Training labels (one label per training row).
train_label_df = pd.read_csv("training_labels.txt")
In [4]:
# Held-out test features.
test_df = pd.read_csv("testing_data.txt")
In [5]:
# Held-out test labels.
test_label_df = pd.read_csv("testing_labels.txt")
In [6]:
X_train = train_df.iloc[:,:22].values
In [7]:
y_train = train_label_df.iloc[:].values
y_train = y_train.reshape(155)
In [8]:
X_test = test_df.iloc[:,:22].values
In [9]:
y_test = test_label_df.iloc[:].values
y_test = y_test.reshape(40)
In [10]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
In [11]:
# Sweep PCA dimensionality: fit a scale -> PCA -> logistic-regression
# pipeline for each component count and record its test-set accuracy.
graph_X = []
graph_Y = []
for n_components in range(2, 22):
    logistic_pipe = Pipeline([
        ('scl', StandardScaler()),
        ('pca', PCA(n_components=n_components)),
        ('clf', LogisticRegression(random_state=1)),
    ])
    logistic_pipe.fit(X_train, y_train)
    graph_X.append(n_components)
    graph_Y.append(logistic_pipe.score(X_test, y_test))
In [12]:
# Accuracy as a function of the number of PCA components, with one x-tick
# per tested value.
plt.plot(graph_X, graph_Y)
plt.xticks(graph_X)
plt.xlabel("Number of components")
plt.ylabel("Accuracy")
plt.show()
In [13]:
import numpy as np
In [17]:
from sklearn.cross_validation import StratifiedKFold
In [18]:
kfold = StratifiedKFold(y=y_train.reshape(155),n_folds=5,random_state=1)
In [19]:
scores = []
In [20]:
logistic_pipe_kfold = Pipeline([('scl', StandardScaler()),('pca', PCA(n_components=5)),('clf',LogisticRegression(random_state=1))])
In [21]:
for k, (train, test) in enumerate(kfold):
logistic_pipe_kfold.fit(X_train[train], y_train[train])
score = logistic_pipe_kfold.score(X_train[test], y_train[test])
scores.append(score)
In [22]:
# Print each fold's accuracy, numbered from 1.
for fold_number, fold_score in enumerate(scores, start=1):
    print("%s: %.3f" % (fold_number, fold_score))
In [23]:
# Mean cross-validated accuracy across the 5 folds.
np.mean(scores)
Out[23]:
In [24]:
# Fold-to-fold spread (standard deviation) of the accuracy.
np.std(scores)
Out[24]:
In [25]:
# sklearn.grid_search was removed in scikit-learn 0.20; GridSearchCV now
# lives in sklearn.model_selection (the constructor arguments used below
# are unchanged).
from sklearn.model_selection import GridSearchCV
In [26]:
from sklearn.svm import SVC
In [27]:
# Scale -> SVM pipeline; kernel and hyper-parameters are tuned below.
pipe_svc = Pipeline([('scl',StandardScaler()),('clf',SVC(random_state=1))])
In [28]:
# Logarithmic grid for C (and gamma, for the RBF kernel).
param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
In [29]:
# Two sub-grids: linear kernel (C only) and RBF kernel (C x gamma).
param_grid = [{'clf__C':param_range,'clf__kernel':['linear']},{'clf__C':param_range,'clf__gamma':param_range,'clf__kernel':['rbf']}]
In [30]:
# 5-fold grid search over the SVM pipeline, parallelized across all cores.
gs = GridSearchCV(estimator=pipe_svc, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
In [31]:
svm_gs = gs.fit(X_train,y_train)
In [32]:
# Best mean cross-validated accuracy found on the training data.
svm_gs.best_score_
Out[32]:
In [33]:
# Hyper-parameter combination that achieved the best CV score.
svm_gs.best_params_
Out[33]:
In [34]:
# Accuracy of the refit best estimator on the held-out test set.
svm_gs.score(X_test,y_test)
Out[34]:
In [35]:
from sklearn.metrics import confusion_matrix
In [36]:
# Predict test labels with the best SVM found by the grid search.
y_pred = svm_gs.predict(X_test)
In [37]:
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
In [38]:
import seaborn as sns
In [39]:
# Annotated heatmap of the confusion matrix (counts in each cell).
ax = sns.heatmap(confmat,annot=True)
In [40]:
plt.show()
In [87]:
from sklearn.tree import DecisionTreeClassifier
In [89]:
# Tune the tree depth over 1-7 plus unbounded (None) with 5-fold grid search.
depth_grid = [{'max_depth': list(range(1, 8)) + [None]}]
dtree_gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=depth_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
In [90]:
# sklearn.cross_validation was removed in scikit-learn 0.20; cross_val_score
# now lives in sklearn.model_selection (the call signature used below is
# unchanged).
from sklearn.model_selection import cross_val_score
In [91]:
# Nested CV: score the whole grid search (inner 5-fold tuning) with an outer
# 5-fold split. NOTE(review): this rebinds `scores`, clobbering the k-fold
# logistic-regression scores computed earlier in the notebook.
scores = cross_val_score(dtree_gs,X_train,y_train,scoring='accuracy',cv=5)
In [93]:
print(scores)
In [94]:
# Mean and spread of the outer-fold accuracies.
np.mean(scores), np.std(scores)
Out[94]:
In [96]:
# Refit the grid search on the full training set for test-set evaluation.
dtree_gs.fit(X_train,y_train)
Out[96]:
In [97]:
# Depth chosen by the inner cross-validation.
dtree_gs.best_params_
Out[97]:
In [98]:
# Test-set accuracy of the best decision tree.
dtree_gs.score(X_test,y_test)
Out[98]:
In [99]:
from sklearn.metrics import confusion_matrix
In [100]:
# Predict test labels with the tuned decision tree.
y_pred = dtree_gs.predict(X_test)
In [101]:
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
In [102]:
confmat
Out[102]:
In [103]:
import seaborn as sns
In [104]:
# Apply seaborn's default theme to subsequent matplotlib figures.
sns.set()
In [108]:
# Annotated heatmap of the decision-tree confusion matrix.
ax = sns.heatmap(confmat, annot=True)
In [110]:
plt.show()
In [41]:
from sklearn.neighbors import KNeighborsClassifier
In [49]:
# Scale -> k-nearest-neighbors pipeline (scaling matters for distance-based
# models).
pipe_knn = Pipeline([('scl',StandardScaler()),('clf',KNeighborsClassifier())])
In [50]:
# Tune neighbor count and Minkowski power (p=1 Manhattan, p=2 Euclidean).
# NOTE(review): this rebinds `param_grid`, shadowing the SVM grid defined
# earlier; harmless here since nothing below reuses the old value.
param_grid = [{'clf__n_neighbors':[3,4,5,6,7,8,9,10],'clf__p':[1,2]}]
In [51]:
gs_knn = GridSearchCV(estimator=pipe_knn, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
In [52]:
gs_knn.fit(X_train,y_train)
Out[52]:
In [53]:
# Best neighbor count / metric found by cross-validation.
gs_knn.best_params_
Out[53]:
In [54]:
# Best mean cross-validated accuracy on the training data.
gs_knn.best_score_
Out[54]:
In [55]:
y_pred = gs_knn.predict(X_test)
In [56]:
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
In [57]:
# Annotated heatmap of the KNN confusion matrix.
ax = sns.heatmap(confmat, annot=True)
In [58]:
plt.show()
In [59]:
# Test-set accuracy of the tuned KNN.
gs_knn.score(X_test,y_test)
Out[59]:
In [60]:
# Fix typo: the class is Sequential ("Sequentialuential" does not exist and
# raises ImportError, breaking every cell below).
from keras.models import Sequential
In [61]:
from keras.layers import Dense
In [63]:
# Small fully-connected binary classifier: 22 inputs (one per feature
# column) -> 12 -> 8 -> 1 sigmoid output.
model = Sequential()
model.add(Dense(12,input_dim=22,activation='relu'))
model.add(Dense(8,activation='relu'))
model.add(Dense(1,activation='sigmoid'))
In [64]:
# Binary cross-entropy matches the single sigmoid output; track accuracy.
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
In [65]:
model.summary()
In [66]:
# NOTE(review): trains on the raw, unscaled X_train, unlike the sklearn
# pipelines above which standardize first — confirm this is intentional.
model.fit(X_train,y_train, epochs=100, batch_size=10)
Out[66]:
In [68]:
# evaluate() returns [loss, accuracy] per the metrics list at compile time.
# NOTE(review): rebinds `scores` yet again (previously CV score arrays).
scores = model.evaluate(X_test,y_test)
In [69]:
# Test-set accuracy (second element of the evaluate() output).
scores[1]
Out[69]:
In [71]:
# Sigmoid probabilities, shape (n_test, 1).
y_pred = model.predict(X_test)
In [78]:
# Threshold the probabilities at 0.5 via round() to get hard 0/1 labels.
y_pred = [round(y[0]) for y in y_pred]
In [80]:
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
In [81]:
# Annotated heatmap of the neural-network confusion matrix.
ax = sns.heatmap(confmat, annot=True)
In [82]:
plt.show()
In [ ]: