notebook.community

Edit and run



In [ ]:

    
"""
Work inspired / copied from here:
http://www.quora.com/What-is-the-future-of-data-science-1
"""



In [7]:

    
"""
Dataset found here:
http://forge.scilab.org/index.php/p/rdataset/source/tree/master/datasets.csv
many other interesting datasets here too!!!
"""









    Out[7]:





'\nDataset found here:\nhttp://forge.scilab.org/index.php/p/rdataset/source/tree/master/datasets.csv\nmany other interesting datasets here too!!!\n'



In [14]:

    
import pandas as pd
import seaborn as sns

%matplotlib inline
import matplotlib.pyplot as plt



In [6]:

    
affairs = "https://raw.github.com/vincentarelbundock/Rdatasets/master/csv/Ecdat/Fair.csv"
df = pd.read_csv(affairs)   
df = df.drop(df.columns[0], axis=1)
df.head()



In [13]:

    
df.head()



In [15]:

    
sns.jointplot(x="nbaffairs", y="age", data=df[df.sex=="male"], kind="reg", size=4);



In [16]:

    
sns.jointplot(x="nbaffairs", y="age", data=df[df.sex=="female"], kind="reg", size=4);



In [ ]:

    
# For the shake of the classification pbm we create the target affair , boolean true if even 1 affair



In [20]:

    
df["affair"] = df.nbaffairs>0



In [21]:

    
# And now we want to rank the variables using as target the affair



In [ ]:

    
def entropy(data, target_attr):
    """
    Calculates the entropy of the given data set for the target attribute.
    """
    val_freq     = {}
    data_entropy = 0.0

    # Calculate the frequency of each of the values in the target attr
    for record in data:
        if (val_freq.has_key(record[target_attr])):
            val_freq[record[target_attr]] += 1.0
        else:
            val_freq[record[target_attr]]  = 1.0

    # Calculate the entropy of the data for the target attribute
    for freq in val_freq.values():
        data_entropy += (-freq/len(data)) * math.log(freq/len(data), 2) 
        
    return data_entropy



In [ ]:

    
def gain(data, attr, target_attr):
    """
    Calculates the information gain (reduction in entropy) that would
    result by splitting the data on the chosen attribute (attr).
    """
    val_freq       = {}
    subset_entropy = 0.0

    # Calculate the frequency of each of the values in the target attribute
    for record in data:
        if (val_freq.has_key(record[attr])):
            val_freq[record[attr]] += 1.0
        else:
            val_freq[record[attr]]  = 1.0

    # Calculate the sum of the entropy for each subset of records weighted
    # by their probability of occuring in the training set.
    for val in val_freq.keys():
        val_prob        = val_freq[val] / sum(val_freq.values())
        data_subset     = [record for record in data if record[attr] == val]
        subset_entropy += val_prob * entropy(data_subset, target_attr)

    # Subtract the entropy of the chosen attribute from the entropy of the
    # whole data set with respect to the target attribute (and return it)
    return (entropy(data, target_attr) - subset_entropy)

	Unnamed: 0	sex	age	ym	child	religious	education	occupation	rate
0	1	male	37	10.00	no	3	18	7	4
1	2	female	27	4.00	no	4	14	6	4
2	3	female	32	15.00	yes	1	12	1	4
3	4	male	57	15.00	yes	5	18	6	5
4	5	male	22	0.75	no	2	17	6	3

	sex	age	ym	child	religious	education	occupation	rate
0	male	37	10.00	no	3	18	7	4
1	female	27	4.00	no	4	14	6	4
2	female	32	15.00	yes	1	12	1	4
3	male	57	15.00	yes	5	18	6	5
4	male	22	0.75	no	2	17	6	3

	sex	age	ym	child	religious	education	occupation	rate	affair
0	male	37	10.00	no	3	18	7	4	False
1	female	27	4.00	no	4	14	6	4	False
2	female	32	15.00	yes	1	12	1	4	False
3	male	57	15.00	yes	5	18	6	5	False
4	male	22	0.75	no	2	17	6	3	False