Downloaded from the UCI Machine Learning Repository on 13 November 2016. The dataset description is as follows:
This data set includes votes for each of the U.S. House of Representatives Congressmen on the 16 key votes identified by the CQA. The CQA lists nine different types of votes: voted for, paired for, and announced for (these three simplified to yea), voted against, paired against, and announced against (these three simplified to nay), voted present, voted present to avoid conflict of interest, and did not vote or otherwise make a position known (these three simplified to an unknown disposition).
Schlimmer, J. C. (1987). Concept acquisition through representational adjustment. Doctoral dissertation, Department of Information and Computer Science, University of California, Irvine, CA.
In [1]:
import dill
import json
import numpy as np
import os
import pandas as pd
import requests
import time
In [2]:
import matplotlib.pyplot as plt
# pandas.tools.plotting was deprecated in pandas 0.20 and removed in 0.25;
# parallel_coordinates and radviz now live in pandas.plotting.
from pandas.plotting import parallel_coordinates, radviz
import seaborn as sns
In [3]:
# sklearn.cross_validation and sklearn.grid_search were deprecated in 0.18 and
# removed in scikit-learn 0.20; train_test_split and GridSearchCV now live in
# sklearn.model_selection.
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
In [4]:
%matplotlib inline
In [5]:
# Importing data from web
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data"


def fetch_data(fname='house-votes-84.data'):
    """Download the UCI voting-records dataset and save it locally.

    Parameters
    ----------
    fname : str
        File name the raw data is written to (in the current directory).

    Returns
    -------
    str
        Absolute path of the downloaded file.

    Raises
    ------
    requests.HTTPError
        If the server responds with an error status.
    """
    # timeout= keeps the notebook from hanging if the archive is unreachable;
    # raise_for_status() avoids silently writing an HTML error page to disk.
    response = requests.get(URL, timeout=30)
    response.raise_for_status()
    outpath = os.path.abspath(fname)
    with open(outpath, 'wb') as f:
        f.write(response.content)
    return outpath


# Fetch the data if required
DATA = fetch_data()
In [6]:
# Column names for the raw CSV: the party label followed by the 16 key votes
# tracked by the CQA.
_VOTE_COLUMNS = [
    "handicapped_infants",
    "water_project_cost_sharing",
    "adoption_of_the_budget_resolution",
    "physician_fee_freeze",
    "el_salvador_aid",
    "religious_groups_in_schools",
    "anti_satellite_test_ban",
    "aid_to_nicaraguan_contras",
    "mx_missile",
    "immigration",
    "synfuels_corporation_cutback",
    "education_spending",
    "superfund_right_to_sue",
    "crime",
    "duty_free_exports",
    "export_administration_act_south_africa",
]
FEATURES = ["class_name"] + _VOTE_COLUMNS
In [7]:
# Load the raw vote records; the file has no header row, so column names
# are supplied explicitly.
df = pd.read_csv(DATA, header=None, names=FEATURES, sep=',')
In [8]:
# Peek at the first five rows of the raw data.
df.head()
Out[8]:
In [9]:
# Describe the dataset. The columns hold strings at this point, so
# describe() reports count/unique/top/freq rather than numeric stats.
print(df.describe())
In [10]:
# Unique value counts for each column ('y'/'n'/'?' for the vote columns,
# 'democrat'/'republican' for class_name).
for i in df.columns:
    print(df[i].value_counts())
In [11]:
# Dataset information. df.info() prints its report directly and returns None,
# so the original print() wrapper emitted a stray "None" line.
df.info()
In [12]:
# Check for missing values. NOTE(review): this finds no NaNs because unknown
# votes are encoded as the string '?' (mapped in the encoding cell) — isnull()
# only counts true NaN/None entries.
print(df.isnull().sum())
In [13]:
# Work on a copy so the raw DataFrame stays untouched.
df_2 = df.copy()
In [14]:
# Encode the categorical strings as small integers.
party_codes = {'democrat': 0, 'republican': 1}
vote_codes = {'n': 0, 'y': 1, '?': 2}

df_2['class_name'] = df_2['class_name'].map(party_codes)
for col in df_2.columns[1:]:
    df_2[col] = df_2[col].map(vote_codes)
In [15]:
df_2.head()
Out[15]:
In [16]:
# First pass at per-column frequency counts, rendered as histograms.
for col in df_2:
    print(col)
    plt.figure(1, figsize=(5, 5), dpi=80)
    plt.subplot(111)
    plt.title("Histogram")
    plt.hist(df_2[col])
    plt.tight_layout()
    plt.show()
In [17]:
# Pairplot of every encoded column against every other.
sns.pairplot(df_2)
Out[17]:
In [18]:
# Correlation heatmap across the integer-encoded columns.
sns.heatmap(df_2.corr())
Out[18]:
In [19]:
# Parallel-coordinates plot: one line per record, coloured by the
# 'class_name' column.
plt.figure(figsize=(12,12))
plt.xticks(rotation='vertical')
parallel_coordinates(df_2, 'class_name')
plt.show()
In [20]:
# Radial (RadViz) plot of the same data, using 'class_name' as the class column.
plt.figure(figsize=(12,12))
radviz(df_2, 'class_name')
plt.show()
In [21]:
# Feature matrix for the split: everything except the target column.
# DataFrame.drop already returns a new frame, so the intermediate .copy()
# in the original was redundant work.
df_3 = df_2.drop('class_name', axis=1)
In [22]:
# Test-train split. Learning curves not performed. Using 80/20% split.
# NOTE(review): no stratify= is passed, so class balance across the splits
# depends on random_state=1 — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(df_3, df_2['class_name'], train_size=0.8,
                                                    random_state=1)
In [23]:
# Data not scaled, since total range of data is 0-2 and categorical.
clf = LogisticRegression()
In [24]:
# Recursive feature elimination with 12-fold cross-validation: eliminates one
# feature per iteration (step=1) and scores each candidate subset by accuracy.
rfecv = RFECV(estimator=clf, step=1, cv=12, scoring='accuracy')
rfecv.fit(X_train, y_train)
print("Optimal number of features : %d" % rfecv.n_features_)
In [25]:
# Plot number of features VS. cross-validation scores.
# RFECV.grid_scores_ was deprecated in scikit-learn 1.0 and removed in 1.2;
# use cv_results_['mean_test_score'] when available, falling back for
# older versions so this cell works either way.
if hasattr(rfecv, 'cv_results_'):
    cv_scores = rfecv.cv_results_['mean_test_score']
else:
    cv_scores = rfecv.grid_scores_
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(cv_scores) + 1), cv_scores)
plt.show()
In [26]:
# Table of features sorted by RFECV rank (1 = selected); the top-ranked
# features become the inputs used for the final model.
print("Features sorted: ")
rfecv_ranking_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rfecv.ranking_,
})
rfecv_ranking_df_sorted = rfecv_ranking_df.sort_values('importance')
rfecv_ranking_df_sorted
Out[26]:
In [27]:
# Issues with subselecting the appropriate columns on the present test-train
# split, so the split is re-done on the RFECV-selected feature subset.
selected_features = ['adoption_of_the_budget_resolution', 'physician_fee_freeze',
                     'immigration', 'synfuels_corporation_cutback',
                     'education_spending']
df_4 = df_3[selected_features]

# 80/20 test-train split, same seed as before. Learning curves not performed.
X_train, X_test, y_train, y_test = train_test_split(
    df_4, df_2['class_name'], train_size=0.8, random_state=1)
In [28]:
# Hyperparameter grid for the logistic-regression search.
param_grid_pipeline = dict(
    C=[0.0001, 0.001, 0.01, 0.1, 1.0, 10, 100],
    fit_intercept=[True, False],
    class_weight=['balanced', None],
    solver=['liblinear', 'newton-cg', 'lbfgs', 'sag'],
)
In [29]:
grid = GridSearchCV(clf, param_grid_pipeline, cv = 12, n_jobs = -1, verbose=1, scoring = 'accuracy')
In [30]:
grid.fit(X_train, y_train)
Out[30]:
In [31]:
# Best mean cross-validated accuracy found by the search.
grid.best_score_
Out[31]:
In [32]:
# Hyperparameters of the winning estimator.
grid.best_estimator_.get_params()
Out[32]:
In [33]:
# Save the best model to disk. A context manager guarantees the file handle
# is flushed and closed even if serialisation fails (the original left the
# handle open).
with open('model_1984cvc_lr', 'wb') as model_file:
    dill.dump(grid.best_estimator_, model_file)
In [34]:
# Reload the model from disk; the context manager closes the handle promptly.
# NOTE: dill/pickle deserialisation can execute arbitrary code — only load
# files you created yourself.
with open('model_1984cvc_lr', 'rb') as model_file:
    grid = dill.load(model_file)
In [35]:
# Predicted target class for the test split (0 = democrat, 1 = republican,
# per the label encoding above).
y_pred = grid.predict(X_test)
y_pred
Out[35]:
In [36]:
# Predicted class probabilities, one column per class.
y_pred_proba = grid.predict_proba(X_test)
y_pred_proba
Out[36]:
In [37]:
y_pred_proba_democrat = y_pred_proba[:,0]
y_pred_proba_republican = y_pred_proba[:,1]
In [38]:
# Gather true labels, predictions, and per-party probabilities into a
# single frame for inspection and export.
prediction_columns = {
    'class_name': y_test,
    'class_name_predicted': y_pred,
    'class_name_prob_democrat': y_pred_proba_democrat,
    'class_name_prob_republican': y_pred_proba_republican,
}
df_pred = pd.DataFrame(prediction_columns)
df_pred.head()
Out[38]:
In [39]:
# Persist the test-split predictions to CSV (row index is included).
df_pred.to_csv('1984cvc_jhb.csv')
In [40]:
# Per-class precision/recall/F1 on the held-out test split.
# target_names is reused later by plot_confusion_matrix.
target_names = ['Democrat', 'Republican']
clr = classification_report(y_test, y_pred, target_names=target_names)
print(clr)
In [41]:
# Confusion matrix as a labelled DataFrame (rows = truth, columns = prediction).
cm = np.array(confusion_matrix(y_test, y_pred, labels=[0, 1]))
row_labels = ['Democrat', 'Republican']
col_labels = ['predicted_Democrat', 'predicted_Republican']
confusion = pd.DataFrame(cm, index=row_labels, columns=col_labels)
confusion
Out[41]:
In [42]:
def plot_confusion_matrix(cm, title='Confusion Matrix', cmap=plt.cm.Blues, labels=None):
    """Render a confusion matrix as a heatmap and save it to disk.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix (raw counts or normalised rates).
    title : str
        Title drawn above the plot.
    cmap : matplotlib colormap
        Colour map for the heatmap cells.
    labels : sequence of str, optional
        Axis tick labels. Defaults to the module-level ``target_names``,
        preserving the original behaviour while removing the hard dependency
        on that global.
    """
    if labels is None:
        labels = target_names
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title, size=15)
    plt.colorbar()
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=0, size=12)
    plt.yticks(tick_marks, labels, rotation=90, size=12)
    plt.tight_layout()
    plt.ylabel('True Label', size=15)
    plt.xlabel('Predicted Label', size=15)
    # Fixed output name: successive calls overwrite the same file.
    plt.savefig('plot_confusion_matrix')
In [48]:
# Plot confusion matrix (raw counts).
cm = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)
print('Confusion matrix, without normalization')
print(cm)
plt.figure()
plot_confusion_matrix(cm)
In [44]:
# Normalize the confusion matrix by row (i.e by the number of samples
# in each class), so the diagonal shows per-class recall.
cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print('Normalized Confusion Matrix')
print(cm_normalized)
plt.figure()
plot_confusion_matrix(cm_normalized, title='Normalized Confusion Matrix')
plt.savefig('plot_norm_confusion_matrix')
plt.show()
In [45]:
# ROC curve and AUC, computed for the positive class only
# (class 1 = republican, per the label encoding).
fpr, tpr, roc_auc = {}, {}, {}
fpr[1], tpr[1], _ = roc_curve(y_test, y_pred_proba_republican)
roc_auc[1] = auc(fpr[1], tpr[1])
In [46]:
# Plot of ROC curve for a specific class
def roc_curve_single_class(fpr, tpr, roc_auc):
    """Plot, save ('plot_roc_curve'), and show the ROC curve for class 1.

    Expects dict-like ``fpr``, ``tpr``, and ``roc_auc`` keyed by class
    index, as built in the previous cell.
    """
    plt.figure()
    auc_label = 'ROC curve (area = %0.2f)' % roc_auc[1]
    plt.plot(fpr[1], tpr[1], label=auc_label)
    # Chance-level diagonal for reference.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', size=15)
    plt.ylabel('True Positive Rate', size=15)
    plt.xticks(size=12)
    plt.yticks(size=12)
    plt.title('Receiver Operating Characteristic (ROC)', size=15)
    plt.legend(loc="lower right")
    plt.savefig('plot_roc_curve')
    plt.show()
In [47]:
roc_curve_single_class(fpr, tpr, roc_auc)
In [ ]: