In [31]:
# Silence library warnings for the rest of the notebook.
# FIX: the original monkey-patched warnings.warn with a no-op, which breaks
# any code relying on the real warn (stacklevel handling, warning filters).
# The supported way to suppress warnings is the filter API.
def warn(*args, **kwargs):
    """No-op warn; kept so existing references to `warn` still work."""
    pass
import warnings
warnings.filterwarnings('ignore')
In [4]:
#GLASS DATASET IMPORT
import pandas as pd
import numpy as np

# glass.txt has no header row; the last column ('Type') is the class label.
columns=['id','RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','Type']
dataset = pd.read_csv('glass.txt',names=columns)
dataset['Type'] = dataset['Type'].astype(int)
labels = dataset['Type'].unique()

# Index of the label column and the feature-column indices used by the
# k-fold loop further down.
yColumn = len(columns) -1
trainColumns = range(yColumn)

# FIX: positional `drop(['Type'], 1)` (axis as 2nd positional arg) was
# deprecated and removed in pandas 2.0; use the columns= keyword.
# NOTE(review): the 'id' column is left inside the feature matrix —
# presumably unintended for a classifier; confirm before training on X.
X = np.array(dataset.drop(columns=['Type']))
y = np.array(dataset['Type'])
In [5]:
# Display the per-class sample counts (the glass data set is imbalanced).
dataset['Type'].value_counts()
Out[5]:
In [ ]:
#breast cancer dataset IMPORT
import pandas as pd
import numpy as np

# The original list hid a backslash-newline inside a string literal; the
# resulting column name is 'nnucleoi' (kept byte-identical here).
columns = ['code_num','thickness','uofcsize','uofcshape','adhesion','secsize',
           'bnuclei','chromatinb','nnucleoi','mitoses','output']
data = pd.read_csv('breast-cancer-wisconsin.data',names=columns)
data['output'] = data['output'].astype(int)

# FIX: positional axis argument and inplace=True; both discouraged and the
# positional form was removed in pandas 2.0.
data = data.drop(columns=['code_num'])

# Missing values are encoded as '?'; replace with a large negative sentinel
# so the whole frame can be cast to int.
# NOTE(review): a -99999 sentinel will distort distance-based models
# (KNN/SVM); consider imputation or dropping those rows instead.
data = data.replace('?',-99999)
data = data.astype(int)

X = np.array(data.drop(columns=['output']))
y = np.array(data['output'])

# 9 feature columns precede the 'output' label column.
yColumn =9
trainColumns = range(yColumn)
data['output'].value_counts()
In [6]:
from sklearn.utils import resample

# Balance the glass classes: upsample every minority class (with
# replacement) to 76 samples, the size of the majority class (Type 2).
df_majority = dataset[dataset.Type==2]

# FIX: the original repeated the same resample() call five times; a loop
# removes the copy-paste.  The class order (1, 7, 3, 5, 6) is preserved so
# the row order of df_upsampled — and therefore X and y — is unchanged.
minority_types = [1, 7, 3, 5, 6]
upsampled_parts = [df_majority]
for glass_type in minority_types:
    df_minority = dataset[dataset.Type == glass_type]
    upsampled_parts.append(
        resample(df_minority,
                 replace=True,       # sample with replacement
                 n_samples=76,       # to match majority class
                 random_state=123))  # reproducible results

# Combine majority class with upsampled minority classes
df_upsampled = pd.concat(upsampled_parts)
print(df_upsampled.Type.value_counts())

# FIX: positional `drop(..., 1)` removed in pandas 2.0.
X = np.array(df_upsampled.drop(columns=['Type']))
y = np.array(df_upsampled['Type'])
In [ ]:
from sklearn.utils import resample

# Balance the breast-cancer classes: upsample the malignant class (4) to
# 458 samples to match the benign class (2).
df_majority = data[data.output==2]
df_minority = data[data.output==4]
df_minority_upsampled = resample(df_minority,
                                 replace=True,      # sample with replacement
                                 n_samples=458,     # to match majority class
                                 random_state=123)  # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled.output.value_counts()

# FIX: positional `drop(..., 1)` removed in pandas 2.0.
X = np.array(df_upsampled.drop(columns=['output']))
# BUG FIX: the original read `df_['output']` — `df_` is undefined and
# raises NameError; the labels must come from the same upsampled frame as X.
y = np.array(df_upsampled['output'])
In [ ]:
#SONAR DATASET IMPORT
import pandas as pd
from collections import Counter
# Sonar.csv ships with a header row, so no explicit column names are given.
data = pd.read_csv('Sonar.csv')
# Show the columns and the class balance of the 'Class' label.
print(data.columns,data.Class.value_counts())
# 60 feature columns precede the label; indices used by the CV loop below.
yColumn = 60
trainColumns = range(yColumn)
In [ ]:
from sklearn.utils import resample
import numpy as np

# Balance the sonar classes: upsample class 1 to 111 samples to match
# class 0.
df_majority = data[data.Class==0]
df_minority = data[data.Class==1]
df_minority_upsampled = resample(df_minority,
                                 replace=True,      # sample with replacement
                                 n_samples=111,     # to match majority class
                                 random_state=123)  # reproducible results

# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled.Class.value_counts()

# FIX: positional `drop(..., 1)` removed in pandas 2.0.
X = np.array(df_upsampled.drop(columns=['Class']))
y = np.array(df_upsampled['Class'])
In [3]:
# load isolet data
import pandas
import numpy as np

# FIX: pandas.io.parsers.read_csv is a private import path (removed in
# recent pandas releases); use the public pandas.read_csv entry point.
dTrain = pandas.read_csv('isolet1+2+3+4.data.gz',compression='gzip',header=None)

# Column 617 holds the integer class label; columns 0..616 are features.
yColumn = 617
trainColumns = range(yColumn)
dTrain[617] = dTrain[617].astype(int)
dTrain[617].value_counts()
Out[3]:
In [4]:
from sklearn.utils import resample
import numpy as np

# "Majority" here is every class except 6; class 6 alone is upsampled.
# NOTE(review): n_samples=240 appears to match the per-class count of the
# other letters, not the size of the combined majority — confirm intent.
df_majority = dTrain[dTrain[617]!=6]
df_minority = dTrain[dTrain[617]==6]
df_minority_upsampled = resample(df_minority,
                                 replace=True,      # sample with replacement
                                 n_samples=240,     # to match per-class majority size
                                 random_state=123)  # reproducible results

# Combine majority classes with the upsampled minority class
df_upsampled = pandas.concat([df_majority, df_minority_upsampled])

# FIX: positional `drop(..., 1)` removed in pandas 2.0.
X = np.array(df_upsampled.drop(columns=[617]))
y = np.array(df_upsampled[617])
In [5]:
def k_fold_cross_validation(X, K, randomise = False):
    """
    Generate K (training, validation) pairs from the items in X.

    Each pair partitions X: the validation part holds every K-th item
    (those with index % K == fold), so it has ~len(X)/K items and the
    training part has the remaining ~(K-1)*len(X)/K items.

    If randomise is true, a shuffled copy of X is partitioned instead,
    leaving the caller's sequence untouched; otherwise the original order
    is preserved in both parts.
    """
    if randomise:
        from random import shuffle
        X = list(X)
        shuffle(X)
    for fold in range(K):
        train_part, val_part = [], []
        for idx, item in enumerate(X):
            # Items whose index hits this fold go to validation.
            (val_part if idx % K == fold else train_part).append(item)
        yield train_part, val_part
In [69]:
from sklearn.model_selection import train_test_split
import numpy as np
# Stratified split so every class appears in both partitions.
# NOTE(review): test_size=0.89 leaves only ~11% of the data for training —
# presumably a deliberate small-training-set experiment; confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.89, random_state=42,stratify = y)
X_train.shape,y_train.shape
Out[69]:
In [24]:
# BUG FIX: this cell contained only the truncated name `Cou`, which raises
# NameError.  Judging by the neighbouring cell, the intent was to inspect
# the class distribution of the training labels.
from collections import Counter
Counter(y_train)
Out[24]:
In [57]:
from collections import Counter
# Class distribution of the training labels after the stratified split.
Counter(y_train)
Out[57]:
In [70]:
from sklearn import preprocessing,neighbors,svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import tree

# Base learners for the ensemble experiments below.  probability=True is
# required on the SVC so predict_proba is available for Borda counting.
clf1 = neighbors.KNeighborsClassifier()
clf2 = svm.SVC(probability=True)
clf3 = LogisticRegression()
clf4 = tree.DecisionTreeClassifier()

# FIX: the fit/score calls were copy-pasted four times; loop instead.
# Names clf1..clf4 and accuracy_1..accuracy_4 are kept for later cells.
classifiers = (clf1, clf2, clf3, clf4)
for clf in classifiers:
    clf.fit(X_train, y_train)
accuracy_1, accuracy_2, accuracy_3, accuracy_4 = (
    clf.score(X_test, y_test) for clf in classifiers
)
print(accuracy_1,accuracy_2,accuracy_3,accuracy_4)
In [ ]:
In [ ]:
# Cast test labels to plain ints so they compare equal to the int
# predictions returned by the voting functions below.
y_test = y_test.astype(int)
In [26]:
import operator
def generate_preference(pred_proba, labels):
    '''
    Build a preference ranking (most- to least-probable label) from a
    classifier's predicted class probabilities.

    Accepts:
        pred_proba: 2-D numpy array of class probabilities for one example
                    (only row 0 is used)
        labels: list of output labels aligned with the probability columns
    Returns: list of labels ordered from highest to lowest probability
    '''
    probs = pred_proba[0]
    # Pair each label with its probability; indexing (not zip) preserves
    # the original IndexError if labels is shorter than the probability row.
    pairs = [(labels[i], probs[i]) for i in range(probs.shape[0])]
    # Sort ascending by probability, then flip — identical tie ordering to
    # the original sort-then-reverse.
    ranked = sorted(pairs, key=operator.itemgetter(1))[::-1]
    return [label for label, _ in ranked]
In [27]:
def borda(preference_ballot):
    '''
    Borda count over a list of preference rankings.

    Accepts: preference_ballot -- list of rankings, each a list of
        candidates ordered from most to least preferred
    Returns: the winning candidate as an int
    '''
    # Candidates are taken from the first ballot only (as in the original);
    # a candidate appearing only in later ballots raises KeyError.
    candidates = set(preference_ballot[0])
    top_score = len(candidates)
    scores = dict.fromkeys(candidates, 0)
    for ranking in preference_ballot:
        # First place earns top_score points, last place earns 1.
        for position, candidate in enumerate(ranking):
            scores[candidate] += top_score - position
    return int(max(scores, key=scores.get))
In [28]:
def get_prediction_borda(test_example):
    """Predict one example by Borda-count fusion of the base learners'
    probability rankings (reads the module-level clf1/clf2/clf3)."""
    labels = list(clf1.classes_)
    ballot = [
        generate_preference(model.predict_proba(test_example), labels)
        for model in (clf1, clf2, clf3)
    ]
    return borda(ballot)
In [29]:
from collections import Counter
def get_prediction_majority(test_example):
    """Predict one example by simple majority vote of the base learners
    (reads the module-level clf1/clf2/clf3); ties break toward the vote
    encountered first, exactly as max() over a Counter does."""
    votes = [model.predict(test_example)[0] for model in (clf1, clf2, clf3)]
    tally = Counter(votes)
    return int(max(tally, key=tally.get))
In [71]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Score both fusion schemes on the held-out test set.
predictions_borda = [get_prediction_borda(example) for example in X_test]
predictions_majority = [get_prediction_majority(example) for example in X_test]

print('Accuracy with Borda Count: ',accuracy_score(y_test,predictions_borda))
print('Accuracy with Majority Voting',accuracy_score(y_test,predictions_majority))
print('F-1 score of Borda Count',f1_score(y_test,predictions_borda,average='macro'))
print('F-1 score of Majority Voting Classifier',f1_score(y_test,predictions_majority,average='macro'))
In [34]:
from sklearn import preprocessing,neighbors,svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.metrics import accuracy_score
import numpy as np
import statistics
# Per-fold accuracies for each base learner and both fusion schemes.
acc_borda = []
acc_majority = []
acc_KNN = []
acc_svc = []
acc_dt = []
# NOTE(review): this loop expects each row of X_train to still CONTAIN the
# label in column `yColumn` (it slices features/labels out of every fold),
# and it rebinds the globals X_train/y_train/clf1/clf2/clf3 on every
# iteration — the voting helpers read those globals.  That makes the cell
# non-idempotent and order-dependent; confirm before re-running.
for training, validation in k_fold_cross_validation(X_train, K=10):
    training = np.array(training)
    validation = np.array(validation)
    # Split the fold rows into feature columns and the label column.
    X_train = [x for x in training[:,trainColumns]]
    y_train = [y for y in training[:,yColumn]]
    X_val = [x for x in validation[:,trainColumns]]
    y_val = [y for y in validation[:,yColumn]]
    # Fresh base learners per fold.
    clf1 = neighbors.KNeighborsClassifier()
    clf2 = svm.SVC(probability=True)
    clf3 = tree.DecisionTreeClassifier()
    clf1.fit(X_train, y_train)
    clf2.fit(X_train, y_train)
    clf3.fit(X_train,y_train)
    acc_KNN.append(clf1.score(X_val, y_val))
    acc_svc.append(clf2.score(X_val, y_val))
    acc_dt.append(clf3.score(X_val,y_val))
    # Fuse the freshly-fitted learners' predictions on the validation fold.
    predictions_borda,predictions_majority = [],[]
    for test_example in X_val:
        predictions_borda.append(get_prediction_borda(test_example))
        predictions_majority.append(get_prediction_majority(test_example))
    acc_borda.append(accuracy_score(y_val,predictions_borda))
    acc_majority.append(accuracy_score(y_val,predictions_majority))
# Mean ± stdev per method, formatted as LaTeX for the report tables.
print('$',round(statistics.mean(acc_KNN)*100,2),'\pm',round(statistics.stdev(acc_KNN)*100,2),'$')
print('$',round(statistics.mean(acc_svc)*100,2),'\pm',round(statistics.stdev(acc_svc)*100,2),'$')
print('$',round(statistics.mean(acc_dt)*100,2),'\pm',round(statistics.stdev(acc_dt)*100,2),'$')
print('$',round(statistics.mean(acc_borda)*100,2),'\pm',round(statistics.stdev(acc_borda)*100,2),'$')
print('$',round(statistics.mean(acc_majority)*100,2),'\pm',round(statistics.stdev(acc_majority)*100,2),'$')
In [37]:
from sklearn import preprocessing,neighbors,svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.metrics import f1_score
import numpy as np
# Train the three base learners on the full training partition.
clf1 = neighbors.KNeighborsClassifier()
clf2 = svm.SVC(probability=True)
clf3 = tree.DecisionTreeClassifier()
# NOTE(review): this assumes the last column of X_train is still the label
# (i.e. X_train was built with the label column included).  Running the
# cell twice strips a real feature column on the second pass —
# non-idempotent; confirm the notebook's execution order.
y_train=X_train[:,-1]
X_train=X_train[:,:-1]
clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train,y_train)
# Individual held-out accuracies, as percentages.
print(round(clf1.score(X_test,y_test)*100,2))
print(round(clf2.score(X_test,y_test)*100,2))
print(round(clf3.score(X_test,y_test)*100,2))
# Collect fused predictions for the held-out test set.
predictions_borda = []
predictions_majority = []
for test_example in X_test:
    predictions_borda.append(get_prediction_borda(test_example))
    predictions_majority.append(get_prediction_majority(test_example))
In [38]:
from sklearn.metrics import accuracy_score
# Held-out accuracy of each fusion scheme, as percentages.
print(round(accuracy_score(y_test,predictions_borda)*100,2))
print(round(accuracy_score(y_test,predictions_majority)*100,2))
In [ ]:
from sklearn import preprocessing,neighbors,svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.metrics import f1_score
import numpy as np
# NOTE(review): this cell duplicates the training cell above (minus the
# scoring); consider removing one of the two.
clf1 = neighbors.KNeighborsClassifier()
clf2 = svm.SVC(probability=True)
clf3 = tree.DecisionTreeClassifier()
# NOTE(review): slicing the label out of X_train again is non-idempotent —
# if the earlier cell already ran, this removes a real feature column.
y_train=X_train[:,-1]
X_train=X_train[:,:-1]
clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train,y_train)
In [ ]:
# Class distribution of the held-out test labels.
Counter(y_test)
In [ ]:
# ROC curve for a fitted binary classifier.
# NOTE(review): `model` is not defined anywhere in this notebook — bind one
# of the fitted classifiers first (e.g. model = clf2) before running.
from sklearn import metrics  # BUG FIX: `metrics` was used below but never imported
import matplotlib.pyplot as plt

probs = model.predict_proba(X_test)
preds = probs[:,1]  # probability of the positive class
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# method I: plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()