In [44]:
import math
import random
# Goal: partition the Iris data set (bezdekIris.data) into a training set and a test set.
# Maps each species name found in the data file to its integer class id.
species_conv = {
    'Iris-setosa': 1,
    'Iris-versicolor': 2,
    'Iris-virginica': 3,
}


def convertLine(attr1, attr2, attr3, attr4, label):
    """Build one record: the four attributes followed by the numeric class id.

    The attributes are passed through unchanged (no numeric conversion is
    performed here); only ``label`` is translated via ``species_conv``.

    Raises:
        KeyError: if ``label`` is not a key of ``species_conv``.
    """
    class_id = species_conv[label]
    record = [attr1, attr2, attr3, attr4]
    record.append(class_id)
    return record
def irisArrayToString(arr):
    """Serialize ``arr`` as one comma-separated line terminated by a newline.

    Every element is converted with ``str`` before joining.
    """
    fields = map(str, arr)
    line = ",".join(fields)
    return line + "\n"
In [45]:
# Number of samples of each class that go into the test file; the rest are
# written to the training file.
TESTS_PER_CLASS = 10

with open('bezdekIris.data', 'r') as data, \
        open("iris_test.data", "w") as test_file, \
        open("iris_train.data", "w") as train_file:
    # Skip blank lines (e.g. a trailing empty line at the end of the file):
    # splitting one would yield a single field and make convertLine raise.
    converted_data = [convertLine(*line_uc.strip().split(","))
                      for line_uc in data
                      if line_uc.strip()]
    classes = [1, 2, 3]

    # Partition data into classes, to ensure an even split between the test
    # and training files.
    converted_data_1, converted_data_2, converted_data_3 = [
        [datum for datum in converted_data if datum[-1] == clazz]
        for clazz in classes
    ]
    random.shuffle(converted_data_1)
    random.shuffle(converted_data_2)
    random.shuffle(converted_data_3)

    # Each entry holds one sample per class. zip() stops at the shortest
    # class list, so any surplus samples of a larger class are not written.
    converted_data_partitoned = list(
        zip(converted_data_1, converted_data_2, converted_data_3))

    # The first TESTS_PER_CLASS triples form the test set.
    for outSet in converted_data_partitoned[:TESTS_PER_CLASS]:
        for datum in outSet:
            test_file.write(irisArrayToString(datum))

    # All remaining triples form the training set. The slice runs to the end
    # of the list: stopping at -1 would silently drop the last triple (one
    # sample of every class) from both output files.
    for outSet in converted_data_partitoned[TESTS_PER_CLASS:]:
        for datum in outSet:
            train_file.write(irisArrayToString(datum))
In [5]:
# Quick check of convertLine on a single raw data line; the cell's last
# expression is displayed as its output.
test = "5.7,2.8,4.1,1.3,Iris-versicolor"
str(convertLine(*test.split(",")))
Out[5]: