In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
In [2]:
data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data", header=None, names=['age', 'workclass', 'fnlwgt',
'education-categorical', 'educ',
'marital-status', 'occupation',
'relationship', 'race', 'sex',
'capital-gain', 'capital-loss',
'hours', 'native-country',
'income'])
In [3]:
# binary target: 1 if the person earns more than 50K
income = 1 * (data['income'] == " >50K")
In [1]:
income.value_counts()
In [3]:
import seaborn
g = seaborn.pairplot(data)
In [4]:
logreg = linear_model.LogisticRegression(C=1e5)

# add a squared-age term to capture the non-linear effect of age on income
age2 = np.square(data['age'])
data = data[['age', 'educ', 'hours']].copy()
data['age2'] = age2
data['income'] = income

X = data[['age', 'age2', 'educ', 'hours']]
Y = data['income']
logreg.fit(X, Y)
Out[4]:
In [5]:
# check the accuracy on the training set
logreg.score(X, Y)
Out[5]:
In [6]:
Y.mean()
Out[6]:
So we have decent predictions, but not great ones. Only 24% of the sample earns more than 50K, which means you could obtain 76% accuracy by always predicting "no". So we're doing better than the null error rate, but not by much. Let's examine the coefficients and see what we learn.
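As a quick sanity check, that comparison can be made explicit; a minimal sketch, assuming the X, Y, and logreg objects defined above:

In [ ]:
# compare the fitted model against the majority-class ("always predict no") baseline
null_accuracy = 1 - Y.mean()          # accuracy of always predicting <=50K
model_accuracy = logreg.score(X, Y)   # training accuracy of the fitted model
print("null accuracy:  %.3f" % null_accuracy)
print("model accuracy: %.3f" % model_accuracy)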
In [7]:
# pair each feature with its fitted coefficient
coefs = np.transpose(logreg.coef_)
pd.DataFrame(list(zip(X.columns, coefs)))
Out[7]:
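Since logistic regression coefficients are on the log-odds scale, exponentiating them gives odds ratios, which are often easier to interpret; a minimal sketch, assuming the fitted logreg model and the feature columns in X:

In [ ]:
# exponentiate the log-odds coefficients into odds ratios
pd.DataFrame({'feature': X.columns,
              'coef': logreg.coef_[0],
              'odds_ratio': np.exp(logreg.coef_[0])})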
In [8]:
# evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
model2 = linear_model.LogisticRegression()
model2.fit(X_train, y_train)
Out[8]:
In [9]:
# predict class labels for the test set
predicted = model2.predict(X_test)
print(predicted)
In [10]:
# generate class probabilities
probs = model2.predict_proba(X_test)
print(probs)
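Finally, the held-out predictions and probabilities can be scored; a minimal sketch, assuming the train/test split above and using sklearn.metrics (not imported earlier) for accuracy and ROC AUC:

In [ ]:
# score the held-out predictions on the test set
from sklearn import metrics

print("test accuracy: %.3f" % metrics.accuracy_score(y_test, predicted))
print("test ROC AUC:  %.3f" % metrics.roc_auc_score(y_test, probs[:, 1]))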