In [31]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

In [4]:
#GLASS DATASET IMPORT
import pandas as pd
import numpy as np
# Column layout of the UCI glass identification data (the file has no header row).
columns=['id','RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','Type']
dataset = pd.read_csv('glass.txt',names=columns)
dataset['Type'] = dataset['Type'].astype(int)
labels = dataset['Type'].unique()
# Index of the target column and range of feature columns, used by the
# k-fold helper further down.
yColumn = len(columns) -1
trainColumns = range(yColumn)
# drop(columns=...): the positional `axis` argument was deprecated in
# pandas 1.0 and removed in pandas 2.0, so drop(['Type'], 1) breaks there.
X = np.array(dataset.drop(columns=['Type']))
y = np.array(dataset['Type'])

In [5]:
# Class distribution of the glass dataset (the classes are imbalanced: 76 vs 9).
dataset['Type'].value_counts()


Out[5]:
2    76
1    70
7    29
3    17
5    13
6     9
Name: Type, dtype: int64

In [ ]:
#breast cancer dataset IMPORT
import pandas as pd
import numpy as np
# BUG FIX: the original used a backslash line continuation *inside* the string
# literal, which silently embedded the continuation indentation into the column
# name (it became '\t\t\tnnucleoi'). Spell the list out explicitly instead.
columns = ['code_num','thickness','uofcsize','uofcshape','adhesion','secsize',
           'bnuclei','chromatinb','nnucleoi','mitoses','output']
data = pd.read_csv('breast-cancer-wisconsin.data',names=columns)
data['output'] = data['output'].astype(int)
# Drop the patient id and replace the missing-value marker '?' with a large
# sentinel so the whole frame can be cast to int.
# drop(columns=...): the positional `axis` argument was removed in pandas 2.0;
# plain reassignment replaces inplace=True (no behavior difference here).
data = data.drop(columns=['code_num'])
data = data.replace('?',-99999)
data = data.astype(int)
X = np.array(data.drop(columns=['output']))
y = np.array(data['output'])
yColumn =9
trainColumns = range(yColumn)
data['output'].value_counts()

In [6]:
from sklearn.utils import resample
# Upsample every minority class of the glass data to the size of the majority
# class (Type == 2, 76 rows) so the classifiers see a balanced set.
# The five copy-pasted resample calls are replaced by one loop.
MAJORITY_CLASS = 2
MAJORITY_SIZE = int((dataset.Type == MAJORITY_CLASS).sum())  # 76 in this data

resampled_parts = [dataset[dataset.Type == MAJORITY_CLASS]]
for minority_class in [1, 7, 3, 5, 6]:
    df_minority = dataset[dataset.Type == minority_class]
    resampled_parts.append(resample(df_minority,
                                    replace=True,             # sample with replacement
                                    n_samples=MAJORITY_SIZE,  # to match majority class
                                    random_state=123))        # reproducible results

# Combine majority class with upsampled minority classes
df_upsampled = pd.concat(resampled_parts)
print(df_upsampled.Type.value_counts())
# drop(columns=...): the positional `axis` argument was removed in pandas 2.0.
X = np.array(df_upsampled.drop(columns=['Type']))
y = np.array(df_upsampled['Type'])


7    76
6    76
5    76
3    76
2    76
1    76
Name: Type, dtype: int64

In [ ]:
from sklearn.utils import resample
# Balance the breast cancer data: class 2 (458 rows) is the majority,
# class 4 the minority.
df_majority = data[data.output==2]
df_minority = data[data.output==4]

df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=458,    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled.output.value_counts()
# BUG FIX: the labels were read from an undefined name `df_` (NameError);
# they must come from the same upsampled frame as the features.
# drop(columns=...): the positional `axis` argument was removed in pandas 2.0.
X = np.array(df_upsampled.drop(columns=['output']))
y = np.array(df_upsampled['output'])

In [ ]:
#SONAR DATASET IMPORT
import pandas as pd
from collections import Counter
# Sonar.csv has a header row, so no `names=` list is needed here.
data = pd.read_csv('Sonar.csv')
print(data.columns,data.Class.value_counts())
# 60 feature columns, then the target column `Class`.
yColumn = 60
trainColumns = range(yColumn)

In [ ]:
from sklearn.utils import resample
import numpy as np
# Balance the sonar data by upsampling class 1 to the size of class 0 (111 rows).
df_majority = data[data.Class==0]
df_minority = data[data.Class==1]

df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=111,    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
df_upsampled.Class.value_counts()
# drop(columns=...): the positional `axis` argument was removed in pandas 2.0.
X = np.array(df_upsampled.drop(columns=['Class']))
y = np.array(df_upsampled['Class'])

In [3]:
# load isolet data
import pandas
import numpy as np
# pandas.read_csv is the public API; pandas.io.parsers.read_csv is an internal
# path that modern pandas no longer exposes. compression='gzip' is kept
# explicit even though it would be inferred from the .gz suffix.
dTrain = pandas.read_csv('isolet1+2+3+4.data.gz',compression='gzip',header=None)
# Column 617 is the letter label (1..26); the first 617 columns are features.
yColumn = 617
trainColumns = range(yColumn)
dTrain[617] = dTrain[617].astype(int)
dTrain[617].value_counts()


Out[3]:
23    240
19    240
8     240
12    240
16    240
20    240
24    240
1     240
5     240
9     240
13    240
17    240
21    240
25    240
2     240
10    240
14    240
18    240
22    240
26    240
3     240
7     240
11    240
15    240
4     240
6     238
Name: 617, dtype: int64

In [4]:
from sklearn.utils import resample
import numpy as np
# Only letter 6 is short (238 rows vs 240 for every other letter);
# upsample it to match the rest.
df_majority = dTrain[dTrain[617]!=6]
df_minority = dTrain[dTrain[617]==6]

df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=240,    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pandas.concat([df_majority, df_minority_upsampled])
# drop(columns=...): the positional `axis` argument was removed in pandas 2.0.
X = np.array(df_upsampled.drop(columns=[617]))
y = np.array(df_upsampled[617])

In [5]:
def k_fold_cross_validation(X, K, randomise=False):
	"""
	Generates K (training, validation) pairs from the items in X.

	Each pair is a partition of X: element i of X goes to validation fold
	i % K, so validation has roughly len(X)/K items and training the rest.

	If randomise is true, a copy of X is shuffled before partitioning,
	otherwise its order is preserved in training and validation.
	"""
	if randomise:
		# Shuffle a copy so the caller's sequence is left untouched.
		from random import shuffle
		X = list(X)
		shuffle(X)
	# range is directly iterable; wrapping it in list() was redundant.
	for k in range(K):
		training = [x for i, x in enumerate(X) if i % K != k]
		validation = [x for i, x in enumerate(X) if i % K == k]
		yield training, validation

In [69]:
from sklearn.model_selection import train_test_split
import numpy as np
# Hold out 89% of the data (stratified on the labels), leaving a deliberately
# tiny training set; random_state pins the split for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.89, random_state=42,stratify = y)
# Last expression displays the training-split sizes.
X_train.shape,y_train.shape


Out[69]:
((50, 10), (50,))

In [24]:
# NOTE(review): leftover fragment — `Cou` is not defined anywhere in this
# notebook (likely an aborted `Counter` call); this cell fails on a fresh run
# and should be deleted.
Cou


Out[24]:
3.85

In [57]:
from collections import Counter
# Class distribution of the (tiny) training split.
Counter(y_train)


Out[57]:
Counter({1: 2, 2: 2, 3: 1, 5: 1, 6: 1, 7: 2})

In [70]:
from sklearn import preprocessing,neighbors,svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import tree

# Four base learners; probability=True lets the SVC expose predict_proba,
# which the Borda-count voting below relies on.
clf1 = neighbors.KNeighborsClassifier()
clf2 = svm.SVC(probability=True)
clf3 = LogisticRegression()
clf4 = tree.DecisionTreeClassifier()

# Fit each learner on the same training split (same order as before).
for classifier in (clf1, clf2, clf3, clf4):
    classifier.fit(X_train, y_train)

# Held-out accuracy of every individual learner.
accuracy_1 = clf1.score(X_test, y_test)
accuracy_2 = clf2.score(X_test, y_test)
accuracy_3 = clf3.score(X_test, y_test)
accuracy_4 = clf4.score(X_test, y_test)
print(accuracy_1,accuracy_2,accuracy_3,accuracy_4)


(0.89901477832512311, 0.81527093596059108, 0.9211822660098522, 0.91133004926108374)

In [ ]:


In [ ]:
# Cast the labels to int so they compare equal to the classifiers' int predictions.
y_test = y_test.astype(int)

In [26]:
import operator
def generate_preference(pred_proba,labels):
    '''
    Accepts:
        pred_proba: Numpy array containing probabilities
        labels: list containing output labels
    Returns: Preference (labels ordered from most to least probable)
    '''
    # predict_proba returns a (1, n_classes) array for one example;
    # keep only its single row.
    probabilities = pred_proba[0]
    class_count = probabilities.shape[0]
    # Pair each label with its predicted probability.
    scored = {labels[idx]: probabilities[idx] for idx in range(class_count)}
    # Sort ascending by probability, then reverse so the most probable label
    # comes first (this two-step form keeps the original tie ordering).
    ordered = sorted(scored.items(), key=operator.itemgetter(1))
    ordered.reverse()
    return [label for label, _ in ordered]

In [27]:
def borda(preference_ballot):
    '''
    Accepts: list of list => preference_ballot
    Returns: Winner (as int)
    '''
    # Every ballot ranks the same candidate set; take it from the first ballot.
    candidates = list(set(preference_ballot[0]))
    top_score = len(candidates)
    tally = dict.fromkeys(candidates, 0)
    # A candidate ranked at position i earns (top_score - i) points.
    for ranking in preference_ballot:
        for position, candidate in enumerate(ranking):
            tally[candidate] += top_score - position
    return int(max(tally, key=tally.get))

In [28]:
def get_prediction_borda(test_example):
    '''Aggregate the fitted base learners' class preferences with Borda count.'''
    # Relies on the module-level clf1..clf3 being fitted classifiers.
    base_learners = [clf1, clf2, clf3]
    class_labels = list(clf1.classes_)
    ballots = [
        generate_preference(learner.predict_proba(test_example), class_labels)
        for learner in base_learners
    ]
    return borda(ballots)

In [29]:
from collections import Counter
def get_prediction_majority(test_example):
    '''Predict by majority vote over the three fitted base learners.'''
    # Relies on the module-level clf1..clf3 being fitted classifiers.
    votes = [learner.predict(test_example)[0] for learner in [clf1, clf2, clf3]]
    tallies = Counter(votes)
    # max over a Counter breaks ties by first occurrence, as before.
    return int(max(tallies, key=tallies.get))

In [71]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
# Score both ensemble schemes over the held-out test set, one example at a time.
predictions_borda,predictions_majority = [],[]
for test_example in X_test:
    predictions_borda.append(get_prediction_borda(test_example))
    predictions_majority.append(get_prediction_majority(test_example))
print('Accuracy with Borda Count: ',accuracy_score(y_test,predictions_borda))
print('Accuracy with Majority Voting',accuracy_score(y_test,predictions_majority))
# Macro-averaged F1 weights every class equally, which matters because the
# original datasets are imbalanced.
print('F-1 score of Borda Count',f1_score(y_test,predictions_borda,average='macro'))
print('F-1 score of Majority Voting Classifier',f1_score(y_test,predictions_majority,average='macro'))


('Accuracy with Borda Count: ', 0.95320197044334976)
('Accuracy with Majority Voting', 0.89901477832512311)
('F-1 score of Borda Count', 0.95317662826622429)
('F-1 score of Majority Voting Classifier', 0.89689270238362406)

In [34]:
from sklearn import preprocessing,neighbors,svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.metrics import accuracy_score
import numpy as np
import statistics
# 10-fold cross-validation comparing three base learners against the two
# ensemble schemes; per-fold accuracies are collected in these lists.
acc_borda = []
acc_majority = []
acc_KNN = []
acc_svc = []
acc_dt = []
# NOTE(review): this assumes X_train carries the target as its last column
# (rows are sliced into trainColumns / yColumn below) — TODO confirm which
# preceding cell prepared it that way.
for training, validation in k_fold_cross_validation(X_train, K=10):
    training = np.array(training)
    validation = np.array(validation)
    # WARNING: these assignments shadow the outer X_train/y_train. The
    # generator already holds a reference to the original X_train, so the
    # loop itself still works, but later cells see the last fold's slices.
    X_train = [x for x in training[:,trainColumns]]
    y_train = [y for y in training[:,yColumn]]
    X_val = [x for x in validation[:,trainColumns]]
    y_val = [y for y in validation[:,yColumn]]
    
    clf1 = neighbors.KNeighborsClassifier()
    clf2 = svm.SVC(probability=True)
    clf3 = tree.DecisionTreeClassifier()

    clf1.fit(X_train, y_train)
    clf2.fit(X_train, y_train)
    clf3.fit(X_train,y_train)

    acc_KNN.append(clf1.score(X_val, y_val))
    acc_svc.append(clf2.score(X_val, y_val))
    acc_dt.append(clf3.score(X_val,y_val))
    
    # The ensemble helpers read the clf1..clf3 globals rebound above, so
    # they vote with this fold's freshly fitted learners.
    predictions_borda,predictions_majority = [],[]
    for test_example in X_val:
        predictions_borda.append(get_prediction_borda(test_example))
        predictions_majority.append(get_prediction_majority(test_example))
    acc_borda.append(accuracy_score(y_val,predictions_borda))
    acc_majority.append(accuracy_score(y_val,predictions_majority))
# Mean ± stdev per method, formatted as LaTeX for the write-up.
print('$',round(statistics.mean(acc_KNN)*100,2),'\pm',round(statistics.stdev(acc_KNN)*100,2),'$')
print('$',round(statistics.mean(acc_svc)*100,2),'\pm',round(statistics.stdev(acc_svc)*100,2),'$')
print('$',round(statistics.mean(acc_dt)*100,2),'\pm',round(statistics.stdev(acc_dt)*100,2),'$')
print('$',round(statistics.mean(acc_borda)*100,2),'\pm',round(statistics.stdev(acc_borda)*100,2),'$')
print('$',round(statistics.mean(acc_majority)*100,2),'\pm',round(statistics.stdev(acc_majority)*100,2),'$')


$ 68.8 \pm 3.68 $
$ 48.72 \pm 6.6 $
$ 60.13 \pm 8.35 $
$ 67.97 \pm 8.15 $
$ 64.76 \pm 7.15 $

In [37]:
from sklearn import preprocessing,neighbors,svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.metrics import f1_score
import numpy as np
# Rebuild the three base learners for a fresh fit on the full training split.
clf1 = neighbors.KNeighborsClassifier()
clf2 = svm.SVC(probability=True)
clf3 = tree.DecisionTreeClassifier()

# NOTE(review): this assumes X_train is a 2-D array whose final column is the
# label — TODO confirm the execution order; after the k-fold cell above,
# X_train is a plain Python list and this slicing would fail on a fresh
# top-to-bottom run.
y_train=X_train[:,-1]
X_train=X_train[:,:-1]

clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train,y_train)

# Individual test accuracies, in percent.
print(round(clf1.score(X_test,y_test)*100,2))
print(round(clf2.score(X_test,y_test)*100,2))
print(round(clf3.score(X_test,y_test)*100,2))

# Collect ensemble predictions for the next cell to score.
predictions_borda = []
predictions_majority = []
for test_example in X_test:
        predictions_borda.append(get_prediction_borda(test_example))
        predictions_majority.append(get_prediction_majority(test_example))


74.41
78.67
60.56

In [38]:
from sklearn.metrics import accuracy_score
# Ensemble accuracies (percent) from the predictions gathered in the cell above.
print(round(accuracy_score(y_test,predictions_borda)*100,2))
print(round(accuracy_score(y_test,predictions_majority)*100,2))


68.68
78.9

In [ ]:
from sklearn import preprocessing,neighbors,svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.metrics import f1_score
import numpy as np
# NOTE(review): near-duplicate of the training cell above; consider deleting
# one of the two to keep the notebook re-runnable.
clf1 = neighbors.KNeighborsClassifier()
clf2 = svm.SVC(probability=True)
clf3 = tree.DecisionTreeClassifier()

# WARNING: slicing X_train again strips another column if the earlier
# duplicate already ran — this cell is not idempotent.
y_train=X_train[:,-1]
X_train=X_train[:,:-1]

clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train,y_train)

In [ ]:
# Class distribution of the test labels.
Counter(y_test)

In [ ]:
# ROC curve and AUC for a fitted *binary* classifier.
from sklearn import metrics  # BUG FIX: `metrics` was used below but never imported

# NOTE(review): `model` is not defined anywhere in this notebook — it must be
# one of the fitted classifiers (e.g. clf2 on the two-class breast cancer
# data); ROC as written only applies to a binary task. TODO confirm.
probs = model.predict_proba(X_test)
preds = probs[:,1]  # probability of the positive class
fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
roc_auc = metrics.auc(fpr, tpr)

# method I: plt
import matplotlib.pyplot as plt
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()