In [757]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
# Default figure size (width, height in inches) for plots that do not pass their own figsize.
plt.rcParams["figure.figsize"] = [12, 9]
# Apply matplotlib's 'ggplot' style sheet to all subsequent figures.
matplotlib.style.use('ggplot')
# IPython magic: render figures inline in the notebook output.
%matplotlib inline
In [758]:
def read_data(path):
    """Load one headerless "adult" data file into a DataFrame.

    The file has no header row, so the fifteen column names are supplied
    here. Whitespace following each delimiter is stripped, and the first
    field is kept as a regular column rather than used as the index.

    Parameters
    ----------
    path : str or file-like
        Passed straight through to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        One row per record, with the columns ``age`` ... ``income``.
    """
    column_names = [
        'age', 'workclass', 'fnlwgt', 'education', 'education_num',
        'marital_status', 'occupation', 'relationship', 'race', 'sex',
        'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
        'income',
    ]
    return pd.read_csv(
        path,
        names=column_names,
        index_col=False,
        skipinitialspace=True,
        # NOTE(review): the published adult.test file is believed to open with
        # a "|1x3 Cross validator" banner line. Parsed as data it becomes a
        # bogus row and forces `age` to object dtype; treating '|' as a comment
        # marker skips it. Harmless when no such line exists - confirm against
        # your copy of the data.
        comment='|',
    )
In [759]:
# Read both data files with the shared loader; the first result is bound to
# `train`, the second to `test`.
train, test = map(
    read_data,
    ('./data/adult/adult.data', './data/adult/adult.test'),
)
In [760]:
# Pool both files into a single frame for exploration.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# ignore_index=True gives the combined frame unique row labels instead of
# repeating each file's own 0..n-1 labels.
# NOTE: this rebinds `train`; re-running the cell without reloading the data
# stacks `test` onto it a second time.
train = pd.concat([train, test], ignore_index=True)
train.head()
Out[760]:
In [761]:
# Histogram overview of the combined frame, drawn on a 12x9 inch figure.
train.hist(figsize=(12, 9))
Out[761]:
`age`, `education_num`, `hours_per_week` and `fnlwgt` seem like good candidates for features. There is not much information in `capital_gain` and `capital_loss`.
In [762]:
## Check for duplicates, nulls
# Drop exact duplicate rows first, then any row with a missing value.
# Rebinding `train` (rather than mutating in place) keeps the step chainable.
train = train.drop_duplicates().dropna()
# Sanity checks: a single False for duplicates, then one False per column for
# nulls. print() with one argument behaves the same on Python 2 and 3.
print(train.duplicated().any())
print(train.isnull().any())
Let's clean some data
In [763]:
# Some income labels carry a trailing '.', which would split each class in
# two; map them onto the dot-free spelling.
# Assigning the whole column avoids the chained `train.income.loc[...] = ...`
# form, which triggers SettingWithCopy warnings and does not write back to
# `train` under pandas copy-on-write.
train['income'] = train['income'].replace({'>50K.': '>50K', '<=50K.': '<=50K'})
train.income.value_counts()
Out[763]:
In [770]:
# Number of rows for every (education_num, income) combination.
education_subset = (
    train.groupby(['education_num', 'income'])
    .size()
    .reset_index(name='count')
)
# Share (a 0-1 fraction, despite the column name) of each education level's
# rows that fall into each income class. The per-level total is the sum of
# that level's group counts, which equals the number of `train` rows with that
# education_num as long as `income` has no nulls (dropna ran earlier). This
# replaces a row-wise apply that re-filtered `train` for every row.
level_totals = education_subset.groupby('education_num')['count'].transform('sum')
education_subset['percentage'] = education_subset['count'] / level_totals
# Bar label, e.g. "13, >50K".
education_subset['education + income'] = (
    education_subset['education_num'].astype(str) + ', ' + education_subset['income']
)
# DataFrame.sort() no longer exists in pandas; sort explicitly by the group keys.
education_subset.sort_values(['education_num', 'income']).plot(
    kind='barh', x='education + income', y='percentage', figsize=(12, 12))
Out[770]:
The plot above shows, for each education level, the share of people in each income bracket. People with a Master's degree or a PhD tend to earn more: a larger share of them falls in the >50K bucket.
In [786]:
# Split the rows by income class and draw one set of histograms per class.
train.groupby('income').hist(figsize=(15,12))
Out[786]:
The first plot shows the distribution of age for incomes <= 50K. Age is used as a proxy for experience, on the assumption that people keep working as they age and acquire more skills in the process. As intuition suggests, the number of people making 50K or less decreases with age.
The second plot shows incomes > 50K. More interestingly, the data shows a peak around age 45. This indicates either that there aren't enough people aged 45+ earning more than 50K in the data, or that income decreases as people approach retirement.
In [810]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
In [817]:
# Create one encoder of each kind up front. `lencoder` is the one applied to
# the categorical columns when the feature frame is built.
lencoder, oencoder = LabelEncoder(), OneHotEncoder()
In [843]:
# Numeric columns are carried over unchanged; .copy() keeps later edits to
# `features` from touching `train`.
features = train[['age', 'education_num', 'hours_per_week', 'fnlwgt']].copy()
# String-valued columns become integer codes. 'income' is the prediction
# target. It must be assigned with [] - `features.income = ...` only sets a
# Python attribute on the object and never creates the column.
# NOTE: the single `lencoder` is refit for every column, so afterwards it only
# holds the mapping for the last one ('income').
for column in ('sex', 'occupation', 'income'):
    features[column] = lencoder.fit_transform(train[column])
features.head()
Out[843]:
In [836]:
from sklearn.ensemble import RandomForestClassifier
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20. Fall back
# to it only on installations too old to have model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
In [837]:
# Split into training and evaluation parts (train_test_split's default sizes).
# axis=1 removes the 'income' *column* from the inputs; with the default
# axis=0, drop() looks for a row labelled 'income' and raises.
# A fixed random_state makes the split identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    features.drop('income', axis=1), features.income, random_state=0)
In [838]:
# Random forest with library-default hyperparameters. The fixed random_state
# makes the fitted trees, and therefore the scores below, reproducible.
model = RandomForestClassifier(random_state=0)
model.fit(x_train, y_train)
Out[838]:
In [839]:
# Predicted income class for every held-out row.
y_hat = model.predict(x_test)
In [840]:
from sklearn.metrics import confusion_matrix, accuracy_score
In [841]:
# Fraction of held-out rows whose predicted class matches the true class.
accuracy_score(y_test, y_hat)
Out[841]:
In [842]:
# Counts of true class (rows) versus predicted class (columns) on the held-out rows.
confusion_matrix(y_test, y_hat)
Out[842]: