In [6]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.cross_validation import train_test_split
%matplotlib inline
In [7]:
# Load the UCI Adult census dataset; the raw file has no header row, so
# column names are supplied explicitly.
# Source: https://archive.ics.uci.edu/ml/datasets/adult
data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
                   header=None,
                   names=['age', 'workclass', 'fnlwgt',
                          'education-categorical', 'educ',
                          'marital-status', 'occupation',
                          'relationship', 'race', 'sex',
                          'capital-gain', 'capital-loss',  # fixed typo: was 'captial-gain'
                          'hours', 'native-country',
                          'income'])
In [10]:
# Drop rows with a missing income label.
data = data[~pd.isnull(data['income'])]
# Restrict the analysis to U.S. respondents. The original cell computed
# this filter but never assigned it back, so the subset was only displayed
# and then discarded — every later cell silently ran on the full dataset.
# (String values in this dataset carry a leading space: " United-States".)
data = data[data['native-country'] == " United-States"]
Out[10]:
In [11]:
# Binary target: 1 where the income string equals " >50K", else 0.
# (Values in this dataset carry a leading space, hence " >50K".)
income = (data['income'] == " >50K").astype(int)
# Quadratic age term, so the model can capture a curved age effect.
age2 = data['age'] ** 2
We'll restrict our search space a bit, keeping only the variables (features) we think are useful.
In [12]:
# Keep only the features we'll model with. `.copy()` makes this an
# explicit new DataFrame, so the column assignments below write into it
# directly instead of into a view of the original `data` (which raises
# pandas' SettingWithCopyWarning and can silently fail to assign).
data = data[['age', 'educ', 'hours']].copy()
data['age2'] = age2
data['income'] = income
In [13]:
# Class balance of the target: counts of 0 (<=50K) vs 1 (>50K).
income.value_counts()
Out[13]:
In [14]:
# Ideally this import lives in the top import cell; kept here so the cell
# stands alone. `sns` is the conventional seaborn alias — the original
# `import seaborn as seaborn` alias was a no-op.
import seaborn as sns
# Pairwise scatter/histogram matrix of the selected features.
g = sns.pairplot(data)
In [31]:
# Fit an (almost) unregularized logistic regression: a very large C makes
# the L2 penalty negligible, approximating plain maximum likelihood.
logreg = linear_model.LogisticRegression(C=1e5)
# The original cell rebuilt `data`, `age2`, and `income` here, duplicating
# the earlier preparation cells verbatim; reuse the prepared frame instead.
X = data[['age', 'age2', 'educ', 'hours']]
Y = data['income']
logreg.fit(X, Y)
Out[31]:
In [32]:
# check the accuracy on the training set
# NOTE(review): this is in-sample accuracy, so it is an optimistic
# estimate of generalization performance.
logreg.score(X, Y)
Out[32]:
In [33]:
# Fraction of positives (>50K). Always predicting the majority class
# would score (1 - this value) — the baseline any model must beat.
Y.mean()
Out[33]:
So we have decent predictions, but not great ones. Only about 24% of the sample earns more than 50K, which means you could obtain roughly 76% accuracy by always predicting "no". So we're doing better than the null error rate, but not by much. Let's examine the coefficients and see what we learn.
In [34]:
# Pair each feature name with its fitted coefficient (log-odds scale).
# Renamed from `g`, which the earlier cell used for the seaborn pair
# plot — reusing one name for unrelated objects is a hidden-state hazard.
coefs = np.transpose(logreg.coef_)
pd.DataFrame(list(zip(X.columns, coefs)))
Out[34]:
In [35]:
# evaluate the model by splitting into train and test sets
# 30% held out; random_state pins the split so results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
# Uses the default regularization strength (C=1.0), unlike the
# near-unregularized C=1e5 model fit above.
model2 = linear_model.LogisticRegression()
model2.fit(X_train, y_train)
Out[35]:
In [36]:
# predict class labels for the test set
# (0/1 labels matching the encoding of `income` defined above)
predicted = model2.predict(X_test)
print(predicted)
In [37]:
# generate class probabilities
# One row per test sample; columns are ordered as in model2.classes_
# (here: P(class 0), P(class 1)).
probs = model2.predict_proba(X_test)
print(probs)
In [ ]: