In [44]:
import math
import random

# Goal: partition the Iris data set (read from bezdekIris.data) into a training set and a test set.

# Integer class id assigned to each species name found in the label column.
species_conv = {
    'Iris-setosa': 1,
    'Iris-versicolor': 2,
    'Iris-virginica': 3,
}


def convertLine(attr1, attr2, attr3, attr4, label):
    """Build one data row with the species name replaced by its class id.

    The four attribute values are passed through unchanged, in order.
    ``label`` is looked up in ``species_conv``; a species name that is not
    a key of that table raises ``KeyError``.

    Returns a new list ``[attr1, attr2, attr3, attr4, class_id]``.
    """
    row = [attr1, attr2, attr3, attr4]
    row.append(species_conv[label])
    return row

def irisArrayToString(arr):
    """Serialize a row as one comma-separated line ending in a newline.

    Every element of ``arr`` is converted with ``str`` before joining, so
    mixed string/number rows are accepted.
    """
    fields = map(str, arr)
    return ",".join(fields) + "\n"

In [45]:
TESTS_PER_CLASS = 10  # samples of each species held out for the test file
SPLIT_SEED = 0        # fixed seed so Restart & Run All reproduces the same split

# Read and convert the whole data set before opening the output files, so a
# parse error cannot leave truncated train/test files behind.
with open('bezdekIris.data', 'r') as data:
    # Skip blank lines (e.g. a trailing newline at EOF): splitting "" yields a
    # single field, which convertLine cannot unpack into its five parameters.
    converted_data = [convertLine(*line_uc.strip().split(","))
                      for line_uc in data if line_uc.strip()]

classes = [1, 2, 3]
# Partition data into classes, to ensure an even split between the test and training files.
converted_data_1, converted_data_2, converted_data_3 = [[datum for datum in converted_data if datum[-1] == clazz]
                                                        for clazz in classes]

# A dedicated generator keeps the split reproducible without touching the
# global random state.
rng = random.Random(SPLIT_SEED)
rng.shuffle(converted_data_1)
rng.shuffle(converted_data_2)
rng.shuffle(converted_data_3)

# One (class 1, class 2, class 3) triple per entry. zip stops at the shortest
# class, so surplus rows of a larger class would not be written to either file.
converted_data_partitoned = list(zip(converted_data_1, converted_data_2, converted_data_3))

with open("iris_test.data", "w") as test_file, \
    open("iris_train.data", "w") as train_file:

    # The first TESTS_PER_CLASS triples form the test set.
    for outSet in converted_data_partitoned[:TESTS_PER_CLASS]:
        for datum in outSet:
            test_file.write(irisArrayToString(datum))
    # Everything else is training data; the slice runs to the end of the list
    # (not to -1) so the last triple is kept.
    for outSet in converted_data_partitoned[TESTS_PER_CLASS:]:
        for datum in outSet:
            train_file.write(irisArrayToString(datum))

In [5]:
# Sanity check: one raw line goes through the same convert + serialize path
# used when writing the train/test files. Serializing with irisArrayToString
# (rather than str(), which would show the list repr) yields the
# comma-separated line with the species replaced by its class id.
test = "5.7,2.8,4.1,1.3,Iris-versicolor"
irisArrayToString(convertLine(*str.split(test, ",")))


Out[5]:
'5.7,2.8,4.1,1.3,2\n'