In [3]:
import urllib2
import numpy
import random
from sklearn import datasets, linear_model
def confusionMatrix(predicted, actual, threshold):
    if len(predicted) != len(actual): return -1
    tp = 0.0
    fp = 0.0
    tn = 0.0
    fn = 0.0
    for i in range(len(actual)):
        if actual[i] > 0.5: #labels that are 1.0 (positive examples)
            if predicted[i] > threshold:
                tp += 1.0 #correctly predicted positive
            else:
                fn += 1.0 #incorrectly predicted negative
        else: #labels that are 0.0 (negative examples)
            if predicted[i] < threshold:
                tn += 1.0 #correctly predicted negative
            else:
                fp += 1.0 #incorrectly predicted positive
    rtn = [tp, fn, fp, tn]
    return rtn
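#quick sanity check of confusionMatrix on a tiny hand-made example (illustrative values only,
#not part of the sonar data): two positives and two negatives, one of each on the wrong side
#of a 0.5 threshold, should give tp = 1, fn = 1, fp = 1, tn = 1
assert confusionMatrix([0.9, 0.2, 0.1, 0.8], [1.0, 1.0, 0.0, 0.0], 0.5) == [1.0, 1.0, 1.0, 1.0]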
#read data from uci data repository
target_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data"
data = urllib2.urlopen(target_url)
#arrange data into list for labels and list of lists for attributes
xList = []
labels = []
for line in data:
    #split on comma
    row = line.strip().split(",")
    #assign label 1.0 for "M" and 0.0 for "R"
    if row[-1] == 'M':
        labels.append(1.0)
    else:
        labels.append(0.0)
    #remove label from row
    row.pop()
    #convert row to floats
    floatRow = [float(num) for num in row]
    xList.append(floatRow)
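#optional sanity check on the download (assumes the full sonar set was retrieved:
#208 rows, 60 attributes per row, 111 mines and 97 rocks)
print("rows = " + str(len(xList)) + "\tattributes = " + str(len(xList[0])) + "\tmines = " + str(int(sum(labels))))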
#divide attribute matrix and label vector into training (2/3 of the data) and test (1/3 of the data) sets
indices = range(len(xList))
xListTest = [xList[i] for i in indices if i%3 == 0 ]
xListTrain = [xList[i] for i in indices if i%3 != 0 ]
labelsTest = [labels[i] for i in indices if i%3 == 0]
labelsTrain = [labels[i] for i in indices if i%3 != 0]
#convert the lists of lists into numpy arrays to match the input types expected by the scikit-learn linear model
xTrain = numpy.array(xListTrain)
yTrain = numpy.array(labelsTrain)
xTest = numpy.array(xListTest)
yTest = numpy.array(labelsTest)
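#the modulo-3 split above is deterministic; as a rough check, the train and test arrays
#should hold roughly 2/3 and 1/3 of the rows respectively
print("train shape = " + str(xTrain.shape) + "\ttest shape = " + str(xTest.shape))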
#train linear regression model
rocksVMinesModel = linear_model.LinearRegression()
rocksVMinesModel.fit(xTrain,yTrain)
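#the fitted model exposes one weight per attribute plus an intercept
#(standard scikit-learn LinearRegression attributes coef_ and intercept_)
print("number of coefficients = " + str(len(rocksVMinesModel.coef_)) + "\tintercept = " + str(rocksVMinesModel.intercept_))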
#generate predictions on the training (in-sample) data
trainingPredictions = rocksVMinesModel.predict(xTrain)
#generate confusion matrix for predictions on training set (in-sample)
#a fixed threshold of 0.5 is used here; the threshold could instead be chosen to maximize
#accuracy (see the sweep sketched after the in-sample results) or read off an ROC curve
confusionMatTrain = confusionMatrix(trainingPredictions, yTrain, 0.5)
#unpack confusion matrix entries
tp = confusionMatTrain[0]; fn = confusionMatTrain[1]; fp = confusionMatTrain[2]; tn = confusionMatTrain[3]
print("Results for predictions in-sample data:")
print("tp = " + str(tp) + "\tfn = " + str(fn) + "\n" + "fp = " + str(fp) + "\ttn = " + str(tn) + '\n')
#generate predictions on out-of-sample data
testPredictions = rocksVMinesModel.predict(xTest)
#generate confusion matrix from predictions on out-of-sample data
conMatTest = confusionMatrix(testPredictions, yTest, 0.5)
#unpack confusion matrix entries
tp = conMatTest[0]; fn = conMatTest[1]; fp = conMatTest[2]; tn = conMatTest[3]
print("Results for predictions out-of-sample data:")
print("tp = " + str(tp) + "\tfn = " + str(fn) + "\n" + "fp = " + str(fp) + "\ttn = " + str(tn) + '\n')