In [757]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib

plt.rcParams["figure.figsize"] = [12, 9]
matplotlib.style.use('ggplot')

%matplotlib inline

Data exploration


In [758]:
def read_data(path):
    return pd.read_csv(path, 
                       index_col=False, 
                       skipinitialspace=True,
                       names=['age', 'workclass', 'fnlwgt', 'education', 'education_num',
                           'marital_status', 'occupation', 'relationship', 'race', 'sex',
                           'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
                           'income']
                       )

In [759]:
train = read_data('./data/adult/adult.data')
test = read_data('./data/adult/adult.test')

In [760]:
train = pd.concat([train, test], ignore_index=True)
train.head()


Out[760]:
age workclass fnlwgt education education_num marital_status occupation relationship race sex capital_gain capital_loss hours_per_week native_country income
0 39 State-gov 77516 Bachelors 13 Never-married Adm-clerical Not-in-family White Male 2174 0 40 United-States <=50K
1 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse Exec-managerial Husband White Male 0 0 13 United-States <=50K
2 38 Private 215646 HS-grad 9 Divorced Handlers-cleaners Not-in-family White Male 0 0 40 United-States <=50K
3 53 Private 234721 11th 7 Married-civ-spouse Handlers-cleaners Husband Black Male 0 0 40 United-States <=50K
4 28 Private 338409 Bachelors 13 Married-civ-spouse Prof-specialty Wife Black Female 0 0 40 Cuba <=50K

In [761]:
train.hist(figsize=(12, 9))


Out[761]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x12c215490>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x129e6f850>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x12892a910>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x12c551bd0>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x122071d50>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x125bee490>]], dtype=object)

age, education_num, hours_per_week and fnlwgt look like good candidate features. There is not much information in capital_gain and capital_loss: most entries are zero.
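
A quick check backs up that last claim: the fraction of zero entries in the two capital columns. This is a small sketch against the combined frame built above.

# share of rows where the capital columns are exactly zero
(train[['capital_gain', 'capital_loss']] == 0).mean()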

Some routine cleanup:

  1. Convert objects to categories
  2. Drop duplicates
  3. Drop NAs - we could impute these values instead, but always try the simpler alternative before making things too complicated :) (see the sketch after this list)
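
One caveat worth a sketch before dropping anything: in the raw Adult data, missing values are encoded as the string '?' rather than NaN, so dropna() on its own may not remove them. Below is an imputation alternative, done on a copy so the steps that follow are unchanged; the column list is an assumption based on where '?' typically appears in this dataset.

# '?' is the missing-value marker in this dataset; convert it to NaN first
imputed = train.replace('?', np.nan)
# fill the categorical gaps with each column's mode (assumed column list)
for column in ['workclass', 'occupation', 'native_country']:
    imputed[column] = imputed[column].fillna(imputed[column].mode()[0])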

In [762]:
# for column in train.select_dtypes(['object']).columns:
#     train[column] = train[column].astype('category')

## Check for duplicates, nulls
train.drop_duplicates(inplace=True)
train.dropna(inplace=True)

print(any(train.duplicated()))
print(train.isnull().any())


False
age               False
workclass         False
fnlwgt            False
education         False
education_num     False
marital_status    False
occupation        False
relationship      False
race              False
sex               False
capital_gain      False
capital_loss      False
hours_per_week    False
native_country    False
income            False
dtype: bool

Let's clean some data: the test-set labels carry a trailing period ('>50K.' and '<=50K.').


In [763]:
# index the frame, not the column, to avoid chained assignment
train.loc[train.income == '>50K.', 'income'] = '>50K'
train.loc[train.income == '<=50K.', 'income'] = '<=50K'
train.income.value_counts()


Out[763]:
<=50K    37128
>50K     11685
dtype: int64
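
An equivalent one-liner for the same cleanup, using pandas string methods (a sketch; rstrip simply drops the trailing period wherever it appears):

# strip the trailing '.' that the test-set labels carry
train['income'] = train['income'].str.rstrip('.')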

Intuition 1:

Higher education should result in more income.


In [770]:
education_subset = train.groupby(['education_num', 'income']).size().reset_index()
education_subset.columns = ['education_num', 'income', 'count']
# share of each income bucket within an education level
func = lambda x: float(x['count']) / len(train[train.education_num == x['education_num']])
education_subset['percentage'] = education_subset.apply(func, axis=1)
education_subset['education + income'] = education_subset.apply(lambda x: '%s, %s' % (x['education_num'], x['income']), axis=1)
education_subset.sort_values(['education_num', 'income']).plot(kind='barh', x='education + income', y='percentage', figsize=(12, 12))


Out[770]:
<matplotlib.axes._subplots.AxesSubplot at 0x12e37e450>

The plot above shows the percentage of the population by education and income. People with a Masters or a PhD tend to earn more: a larger share of them lands in the >50K bucket.
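
The same percentages can be read off a single crosstab, without the manual groupby/apply dance (a sketch; normalize='index' makes each education row sum to 1):

# income share within each education level
pd.crosstab(train.education_num, train.income, normalize='index')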

Intuition 2:

People earn more as they get more experience.


In [786]:
train.groupby('income').hist(figsize=(15,12))


Out[786]:
income
<=50K    [[Axes(0.125,0.684722;0.336957x0.215278), Axes...
>50K     [[Axes(0.125,0.684722;0.336957x0.215278), Axes...
dtype: object

The first plot shows the distribution of age for income <= 50K. Age is used as a proxy for experience; the assumption is that people continue to work as they age, acquiring more skills in the process. As per the intuition, the number of people making less than 50K decreases with age.

The second plot shows income > 50K. More interestingly, the data shows a peak around 45. This indicates either that there aren't enough people of age 45+ earning more than 50K in the data, or that income decreases as people approach retirement.
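
A quick numeric companion to those plots, summarizing age within each income bucket (a sketch against the cleaned frame):

# count, mean and percentiles of age per income class
train.groupby('income')['age'].describe()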

Feature construction


In [810]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [817]:
lencoder = LabelEncoder()
oencoder = OneHotEncoder()

In [843]:
features = pd.DataFrame()
features['age'] = train['age']
features['education_num'] = train['education_num']
features['hours_per_week'] = train['hours_per_week']
features['fnlwgt'] = train['fnlwgt']
features['sex'] = lencoder.fit_transform(train.sex)
features['occupation'] = lencoder.fit_transform(train.occupation)

# encode the target separately; assigning via `features.income = ...` would
# set an attribute on the DataFrame rather than create a column
target = lencoder.fit_transform(train.income)
features.head()


Out[843]:
age education_num hours_per_week fnlwgt sex occupation
0 39 13 40 77516 1 1
1 50 13 13 83311 1 4
2 38 9 40 215646 1 6
3 53 7 40 234721 1 6
4 28 13 40 338409 0 10

Model fitting


In [836]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [837]:
x_train, x_test, y_train, y_test = train_test_split(features, target)

In [838]:
model = RandomForestClassifier()
model.fit(x_train, y_train)


Out[838]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [839]:
y_hat = model.predict(x_test)

Model/Feature Evaluation


In [840]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [841]:
accuracy_score(y_test, y_hat)


Out[841]:
0.79260898066207797
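
For context, the classes are imbalanced: from the value counts above, always predicting <=50K already scores 37128 / 48813 ≈ 0.76, so the model beats the majority-class baseline, but not by a huge margin. A sketch of that baseline, assuming LabelEncoder mapped '<=50K' to 0 (its alphabetical ordering does exactly that here):

# accuracy of the trivial predict-the-majority-class model
(y_test == 0).mean()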

In [842]:
confusion_matrix(y_test, y_hat)


Out[842]:
array([[8456,  850],
       [1681, 1217]])
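
The second row shows the model finds fewer than half of the >50K earners: recall on that class is 1217 / (1681 + 1217) ≈ 0.42. A per-class breakdown makes this easier to read; target_names assumes the 0/1 encoding noted above.

from sklearn.metrics import classification_report

print(classification_report(y_test, y_hat, target_names=['<=50K', '>50K']))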