Kristaps Taube, kt11023

ld1


In [6]:
%matplotlib inline

In [91]:
import csv
# Data from the files

def get_attributes():
    # read attributes
    with open('./ld1/attributes.txt') as f:
        # one line per row, tab-separated digits (\d\t\d\n); only the first field is kept
        attributes = [int(a[0]) for a in csv.reader(f, delimiter='\t')]
    return attributes


def get_example_data():
    # read the examples line by line
    with open('./ld1/examples.txt') as f:
        # lines of tab-separated digits (\d\t\d\t...\d\n);
        # zip(*...) transposes the rows into one tuple per column
        data = [tuple(map(int, d))
                for d in zip(*csv.reader(f, delimiter='\t'))]
    return data

attributes = get_attributes()
data = get_example_data()
print(attributes)
data


[0, 1, 2, 3, 4]
Out[91]:
[(3, 3, 2, 3, 1, 1, 3, 2, 1, 1, 3, 2, 1, 3),
 (1, 2, 2, 2, 2, 2, 1, 1, 3, 3, 3, 3, 3, 1),
 (2, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2),
 (1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1),
 (1, 2, 2, 1, 3, 3, 1, 3, 3, 3, 1, 2, 3, 2)]
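
Note that zip(*csv.reader(...)) transposes the file: each tuple above is one
column of examples.txt, not one row. A minimal illustration with made-up rows
(hypothetical values, not the real file contents):

In [ ]:
# Three hypothetical rows of two tab-separated fields, as csv.reader yields them.
rows = [['1', '2'], ['3', '4'], ['5', '6']]

# zip(*rows) pairs up the i-th field of every row, turning rows into columns.
print([tuple(map(int, col)) for col in zip(*rows)])  # [(1, 3, 5), (2, 4, 6)]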

Helper functions


In [92]:
import math

def unique(lst):
    """
    Returns a list of the unique values found in lst, preserving the
    order in which they first appear.
    """
    lst = lst[:]
    unique_lst = []

    # Cycle through the list and add each value to the unique list only once.
    for item in lst:
        if item not in unique_lst:
            unique_lst.append(item)

    # Return the list with all redundant values removed.
    return unique_lst


def most_frequent(lst):
    """
    Returns the item that appears most frequently in the given list.
    """
    lst = lst[:]
    highest_freq = 0
    most_freq = None

    for val in unique(lst):
        if lst.count(val) > highest_freq:
            most_freq = val
            highest_freq = lst.count(val)

    return most_freq


def majority_value(data, target_attr):
    """
    Creates a list of all values of the target attribute for each record
    in the data list, and returns the value that appears in this list
    most frequently.
    """
    data = data[:]
    return most_frequent([record[target_attr] for record in data])


def choose_attribute(data, attributes, target_attr, fitness):
    """
    Cycles through all the attributes and returns the attribute with the
    highest information gain (i.e. the lowest weighted subset entropy).
    """
    data = data[:]
    best_gain = 0.0
    best_attr = None

    for attr in attributes:
        gains = fitness(data, attr, target_attr)
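        # NB: with '>=', the last attribute examined wins ties in gain.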
        if gains >= best_gain and attr != target_attr:
            best_gain = gains
            best_attr = attr

    return best_attr


def get_values(data, attr):
    """
    Creates a list of values of the chosen attribute for each record in
    the data, prunes out the redundant values, and returns the list.
    """
    data = data[:]
    return unique([record[attr] for record in data])


def get_examples(data, attr, value):
    """
    Returns a list of all the records in <data> with the value of <attr>
    matching the given value.
    """
    data = data[:]
    rtn_lst = []

    if not data:
        return rtn_lst
    else:
        record = data.pop()
        if record[attr] == value:
            rtn_lst.append(record)
            rtn_lst.extend(get_examples(data, attr, value))
            return rtn_lst
        else:
            rtn_lst.extend(get_examples(data, attr, value))
            return rtn_lst
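
A quick sanity check of the helpers on hand-made records (the values are
arbitrary):

In [ ]:
# Three hypothetical records, two attributes each.
toy = [(1, 2), (1, 3), (2, 2)]

print(unique([r[0] for r in toy]))         # [1, 2]
print(most_frequent([r[0] for r in toy]))  # 1
print(majority_value(toy, 0))              # 1
print(get_examples(toy, 1, 2))             # [(2, 2), (1, 2)] -- matched records, last-first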

Entropy and Gain calculation implementations


In [93]:
def entropy(data, target_attr):
    """
    Calculates the entropy of the given data set for the target attribute.
    """
    val_freq = {}
    data_entropy = 0.0

    # Calculate the frequency of each of the values in the target attr
    for record in data:
        if record[target_attr] in val_freq:
            val_freq[record[target_attr]] += 1.0
        else:
            val_freq[record[target_attr]] = 1.0

    # Calculate the entropy of the data for the target attribute
    for freq in val_freq.values():
        data_entropy += (-freq / len(data)) * math.log(freq / len(data), 2)

    return data_entropy


def gain(data, attr, target_attr):
    """
    Calculates the information gain (reduction in entropy) that would
    result from splitting the data on the chosen attribute (attr).
    """
    val_freq = {}
    subset_entropy = 0.0

    # Calculate the frequency of each of the values in the chosen attribute
    for record in data:
        if record[attr] in val_freq:
            val_freq[record[attr]] += 1.0
        else:
            val_freq[record[attr]] = 1.0

    # Calculate the sum of the entropy for each subset of records weighted
    # by their probability of occurring in the training set.
    for val in val_freq.keys():
        val_prob = val_freq[val] / sum(val_freq.values())
        data_subset = [record for record in data if record[attr] == val]
        subset_entropy += val_prob * entropy(data_subset, target_attr)

    # Subtract the weighted subset entropy from the entropy of the whole
    # data set with respect to the target attribute (and return it)
    return entropy(data, target_attr) - subset_entropy
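
A small worked check on hypothetical records: a 50/50 split on the target is
exactly 1.0 bit of entropy, and an attribute that separates the two classes
perfectly recovers all of it as gain:

In [ ]:
# Four hypothetical records; attribute 1 perfectly predicts the target (attribute 0).
toy = [(1, 1), (1, 1), (2, 2), (2, 2)]

print(entropy(toy, 0))  # 1.0
print(gain(toy, 1, 0))  # 1.0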

In [94]:
def create_decision_tree(data, attributes, target_attr, fitness_func):
    vals = [record[target_attr] for record in data]
    default = majority_value(data, target_attr)

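    # Stop when no data remains or only the target attribute is left
    # (the target itself still counts toward len(attributes), hence the -1).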
    if not data or (len(attributes) - 1) <= 0:
        return default

    # If all the records in the dataset have the same classification,
    # return that classification.
    elif vals.count(vals[0]) == len(vals):
        return vals[0]
    else:
        # Choose the next best attribute to classify our data
        best = choose_attribute(data, attributes, target_attr,
                                fitness_func)
        
        # Create a new decision tree/node with the best attribute and an empty
        # dictionary object--we'll fill that up next.
        tree = {best: {}}

        # Create a new decision tree/sub-node for each of the values in the
        # best attribute field
        for val in get_values(data, best):
            # Create a subtree for the current value under the "best" field
            subtree = create_decision_tree(
                get_examples(data, best, val),
                [attr for attr in attributes if attr != best],
                target_attr,
                fitness_func)

            # Add the new subtree to the empty dictionary object in our new
            # tree/node we just created.
            tree[best][val] = subtree

    return tree
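
The tree comes back as nested dicts of the form {attribute: {value: subtree or
class}}, so classifying a new record is a walk down that structure. A minimal
sketch (classify is not part of the assignment code; returning None for an
unseen value is an assumption):

In [ ]:
def classify(tree, record):
    # A leaf is just the class label itself.
    if not isinstance(tree, dict):
        return tree
    # An inner node is a single attribute index mapping values to subtrees.
    attr = next(iter(tree))
    subtree = tree[attr].get(record[attr])
    if subtree is None:
        return None  # unseen attribute value: no branch to follow (assumed fallback)
    return classify(subtree, record)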

In [95]:
tree = create_decision_tree(data, attributes, 0, gain)
import json

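# Attribute index (1-4) -> name, plus per-attribute value labels;
# decode_attrs[3] is also used below to label the target classes at the leaves.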
decode_attr = ['history', 'debt', 'recommendations', 'income']
decode_attrs = [
    ['bad', 'unknown', 'GOOD'],
    ['low', 'HIGH'],
    ['no', 'YES'],
    ['low', 'medium', 'HIGH']
]


def print_tree(tree, indent=1, is_attr=True):
    # Levels alternate: attribute-index keys, then attribute-value keys.
    for k, v in tree.items():
        label = decode_attr[k - 1] if is_attr else '= {}'.format(k)
        if isinstance(v, dict):
            print('{}↳ {}'.format(' ' * indent, label))
            print_tree(v, indent + 2, not is_attr)
        else:
            print('{}↳ {}  ->(*) {}'.format(' ' * indent, label, decode_attrs[3][v - 1]))

print(json.dumps(tree, indent=4, sort_keys=True))

print_tree(tree)


{
    "3": {
        "1": {
            "4": {
                "1": {
                    "1": {
                        "1": 1,
                        "2": 2
                    }
                },
                "3": 1
            }
        },
        "2": 1,
        "3": 3
    }
}
 ↳ recommendations
   ↳ = 1
     ↳ income
       ↳ = 1
         ↳ history
           ↳ = 1  ->(*) low
           ↳ = 2  ->(*) medium
       ↳ = 3  ->(*) low
   ↳ = 2  ->(*) low
   ↳ = 3  ->(*) HIGH
