In [ ]:
"""
Work inspired / copied from here:
http://www.quora.com/What-is-the-future-of-data-science-1
"""

In [7]:
"""
Dataset found here:
http://forge.scilab.org/index.php/p/rdataset/source/tree/master/datasets.csv
many other interesting datasets here too!!!
"""


Out[7]:
'\nDataset found here:\nhttp://forge.scilab.org/index.php/p/rdataset/source/tree/master/datasets.csv\nmany other interesting datasets here too!!!\n'

In [14]:
import pandas as pd
import seaborn as sns

%matplotlib inline
import matplotlib.pyplot as plt

In [6]:
affairs = "https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/Ecdat/Fair.csv"
df = pd.read_csv(affairs)   
df = df.drop(df.columns[0], axis=1)
df.head()


Out[6]:
Unnamed: 0 sex age ym child religious education occupation rate nbaffairs
0 1 male 37 10.00 no 3 18 7 4 0
1 2 female 27 4.00 no 4 14 6 4 0
2 3 female 32 15.00 yes 1 12 1 4 0
3 4 male 57 15.00 yes 5 18 6 5 0
4 5 male 22 0.75 no 2 17 6 3 0

In [13]:
df.head()


Out[13]:
sex age ym child religious education occupation rate nbaffairs
0 male 37 10.00 no 3 18 7 4 0
1 female 27 4.00 no 4 14 6 4 0
2 female 32 15.00 yes 1 12 1 4 0
3 male 57 15.00 yes 5 18 6 5 0
4 male 22 0.75 no 2 17 6 3 0

In [15]:
sns.jointplot(x="nbaffairs", y="age", data=df[df.sex=="male"], kind="reg", size=4);



In [16]:
sns.jointplot(x="nbaffairs", y="age", data=df[df.sex=="female"], kind="reg", size=4);



In [ ]:
# For the shake of the classification pbm we create the target affair , boolean true if even 1 affair

In [20]:
df["affair"] = df.nbaffairs>0

In [21]:
# And now we want to rank the variables using as target the affair


Out[21]:
sex age ym child religious education occupation rate nbaffairs affair
0 male 37 10.00 no 3 18 7 4 0 False
1 female 27 4.00 no 4 14 6 4 0 False
2 female 32 15.00 yes 1 12 1 4 0 False
3 male 57 15.00 yes 5 18 6 5 0 False
4 male 22 0.75 no 2 17 6 3 0 False

In [ ]:
def entropy(data, target_attr):
    """
    Calculates the entropy of the given data set for the target attribute.
    """
    val_freq     = {}
    data_entropy = 0.0

    # Calculate the frequency of each of the values in the target attr
    for record in data:
        if (val_freq.has_key(record[target_attr])):
            val_freq[record[target_attr]] += 1.0
        else:
            val_freq[record[target_attr]]  = 1.0

    # Calculate the entropy of the data for the target attribute
    for freq in val_freq.values():
        data_entropy += (-freq/len(data)) * math.log(freq/len(data), 2) 
        
    return data_entropy

In [ ]:
def gain(data, attr, target_attr):
    """
    Calculates the information gain (reduction in entropy) that would
    result by splitting the data on the chosen attribute (attr).
    """
    val_freq       = {}
    subset_entropy = 0.0

    # Calculate the frequency of each of the values in the target attribute
    for record in data:
        if (val_freq.has_key(record[attr])):
            val_freq[record[attr]] += 1.0
        else:
            val_freq[record[attr]]  = 1.0

    # Calculate the sum of the entropy for each subset of records weighted
    # by their probability of occuring in the training set.
    for val in val_freq.keys():
        val_prob        = val_freq[val] / sum(val_freq.values())
        data_subset     = [record for record in data if record[attr] == val]
        subset_entropy += val_prob * entropy(data_subset, target_attr)

    # Subtract the entropy of the chosen attribute from the entropy of the
    # whole data set with respect to the target attribute (and return it)
    return (entropy(data, target_attr) - subset_entropy)