In [1]:
import pandas as pd
# Location of the voting dataset, relative to the notebook's working directory.
DATA_PATH = 'congressional_voting_scikit_label_dataset.csv'
# Load the raw table once; later cells read from data_df without modifying it.
data_df = pd.read_csv(DATA_PATH)

In [2]:
import numpy as np
# Seed NumPy's global random generator so NumPy-based randomness is repeatable between runs.
np.random.seed(12345)
%matplotlib inline
import seaborn as sns

import matplotlib.pyplot as plt
# Use matplotlib's 'ggplot' style sheet for every figure drawn after this point.
plt.style.use('ggplot')

In [3]:
# Preview the first rows of the raw table (vote columns plus the political_party column).
data_df.head()


Out[3]:
handicapped_infants water_project_cost_sharing adoption_of_the_budget_resolution physician_fee_freeze el_salvador_aid religious_groups_in_schools anti_satellite_test_ban aid_to_nicaraguan_contras mx_missile immigration synfuels_corporation_cutback education_spending superfund_right_to_sue crime duty_free_exports export_administration_act_south_africa political_party
0 1.0 2.0 1.0 2.0 2.0 2.0 1.0 1.0 1.0 2.0 0.0 2.0 2.0 2.0 1.0 2.0 1.0
1 1.0 2.0 1.0 2.0 2.0 2.0 1.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 1.0 0.0 1.0
2 0.0 2.0 2.0 0.0 2.0 2.0 1.0 1.0 1.0 1.0 2.0 1.0 2.0 2.0 1.0 1.0 0.0
3 1.0 2.0 2.0 1.0 0.0 2.0 1.0 1.0 1.0 1.0 2.0 1.0 2.0 1.0 1.0 2.0 0.0
4 2.0 2.0 2.0 1.0 2.0 2.0 1.0 1.0 1.0 1.0 2.0 0.0 2.0 2.0 2.0 2.0 0.0

Class Distribution


In [4]:
# Number of rows for each political_party label.
counts = data_df.political_party.value_counts()

# Draw on an explicit Axes and label it so the chart can be read on its own
# when the notebook is skimmed.
fig, ax = plt.subplots()
counts.plot(kind='bar', ax=ax)
# Trailing semicolon suppresses the repr of ax.set()'s return value.
ax.set(
    title='Class distribution',
    xlabel='political_party',
    ylabel='Number of rows',
);


Feature Counts


In [5]:
# For every vote column, show how often each encoded answer occurs within
# each political_party group.
columns = data_df.columns
gp = data_df.groupby('political_party')

for column in columns:
    # The grouping column itself is the label, not a feature -- skip it.
    if column == 'political_party':
        continue
    # print() is required here: only a cell's last expression is auto-displayed.
    print(gp[column].value_counts())


political_party  handicapped_infants
0.0              2.0                    156
                 1.0                    102
                 0.0                      9
1.0              1.0                    134
                 2.0                     31
                 0.0                      3
Name: handicapped_infants, dtype: int64
political_party  water_project_cost_sharing
0.0              2.0                           120
                 1.0                           119
                 0.0                            28
1.0              2.0                            75
                 1.0                            73
                 0.0                            20
Name: water_project_cost_sharing, dtype: int64
political_party  adoption_of_the_budget_resolution
0.0              2.0                                  231
                 1.0                                   29
                 0.0                                    7
1.0              1.0                                  142
                 2.0                                   22
                 0.0                                    4
Name: adoption_of_the_budget_resolution, dtype: int64
political_party  physician_fee_freeze
0.0              1.0                     245
                 2.0                      14
                 0.0                       8
1.0              2.0                     163
                 0.0                       3
                 1.0                       2
Name: physician_fee_freeze, dtype: int64
political_party  el_salvador_aid
0.0              1.0                200
                 2.0                 55
                 0.0                 12
1.0              2.0                157
                 1.0                  8
                 0.0                  3
Name: el_salvador_aid, dtype: int64
political_party  religious_groups_in_schools
0.0              1.0                            135
                 2.0                            123
                 0.0                              9
1.0              2.0                            149
                 1.0                             17
                 0.0                              2
Name: religious_groups_in_schools, dtype: int64
political_party  anti_satellite_test_ban
0.0              2.0                        200
                 1.0                         59
                 0.0                          8
1.0              1.0                        123
                 2.0                         39
                 0.0                          6
Name: anti_satellite_test_ban, dtype: int64
political_party  aid_to_nicaraguan_contras
0.0              2.0                          218
                 1.0                           45
                 0.0                            4
1.0              1.0                          133
                 2.0                           24
                 0.0                           11
Name: aid_to_nicaraguan_contras, dtype: int64
political_party  mx_missile
0.0              2.0           188
                 1.0            60
                 0.0            19
1.0              1.0           146
                 2.0            19
                 0.0             3
Name: mx_missile, dtype: int64
political_party  immigration
0.0              1.0            139
                 2.0            124
                 0.0              4
1.0              2.0             92
                 1.0             73
                 0.0              3
Name: immigration, dtype: int64
political_party  synfuels_corporation_cutback
0.0              2.0                             129
                 1.0                             126
                 0.0                              12
1.0              1.0                             138
                 2.0                              21
                 0.0                               9
Name: synfuels_corporation_cutback, dtype: int64
political_party  education_spending
0.0              1.0                   213
                 2.0                    36
                 0.0                    18
1.0              2.0                   135
                 1.0                    20
                 0.0                    13
Name: education_spending, dtype: int64
political_party  superfund_right_to_sue
0.0              1.0                       179
                 2.0                        73
                 0.0                        15
1.0              2.0                       136
                 1.0                        22
                 0.0                        10
Name: superfund_right_to_sue, dtype: int64
political_party  crime
0.0              1.0      167
                 2.0       90
                 0.0       10
1.0              2.0      158
                 0.0        7
                 1.0        3
Name: crime, dtype: int64
political_party  duty_free_exports
0.0              2.0                  160
                 1.0                   91
                 0.0                   16
1.0              1.0                  142
                 2.0                   14
                 0.0                   12
Name: duty_free_exports, dtype: int64
political_party  export_administration_act_south_africa
0.0              2.0                                       173
                 0.0                                        82
                 1.0                                        12
1.0              2.0                                        96
                 1.0                                        50
                 0.0                                        22
Name: export_administration_act_south_africa, dtype: int64

Convert to One-Hot Encoding


In [6]:
# One-hot encode every vote column; political_party is the label and is
# carried over unchanged as the last column of encoded_df.
columns = data_df.columns
encoded_parts = []  # one indicator frame per feature, concatenated below

for column in columns:
    if column == 'political_party':
        continue
    # Request uint8 explicitly: pandas >= 2.0 defaults to bool indicators,
    # older versions produced uint8, and the rest of the notebook shows 0/1.
    cat_df = pd.get_dummies(data_df[column], dtype='uint8')
    # Name each indicator after the category value it actually encodes
    # (e.g. 2.0 -> '<column>_2'). Numbering by position would silently
    # mislabel columns whenever a feature is missing one of the categories.
    cat_df.columns = [
        '{}_{:g}'.format(column, value) for value in cat_df.columns
    ]
    encoded_parts.append(cat_df)

# All pieces share data_df's index, so a column-wise concat lines rows up.
encoded_df = pd.concat(encoded_parts, axis=1)
encoded_df['political_party'] = data_df['political_party']

In [7]:
# Preview the encoded table: one 0/1 indicator column per (vote column, value) pair plus political_party.
encoded_df.head()


Out[7]:
handicapped_infants_0 handicapped_infants_1 handicapped_infants_2 water_project_cost_sharing_0 water_project_cost_sharing_1 water_project_cost_sharing_2 adoption_of_the_budget_resolution_0 adoption_of_the_budget_resolution_1 adoption_of_the_budget_resolution_2 physician_fee_freeze_0 ... crime_0 crime_1 crime_2 duty_free_exports_0 duty_free_exports_1 duty_free_exports_2 export_administration_act_south_africa_0 export_administration_act_south_africa_1 export_administration_act_south_africa_2 political_party
0 0 1 0 0 0 1 0 1 0 0 ... 0 0 1 0 1 0 0 0 1 1.0
1 0 1 0 0 0 1 0 1 0 0 ... 0 0 1 0 1 0 1 0 0 1.0
2 1 0 0 0 0 1 0 0 1 1 ... 0 0 1 0 1 0 0 1 0 0.0
3 0 1 0 0 0 1 0 0 1 0 ... 0 1 0 0 1 0 0 0 1 0.0
4 0 0 1 0 0 1 0 0 1 0 ... 0 0 1 0 0 1 0 0 1 0.0

5 rows × 49 columns

Model and Cross-Validation


In [8]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression


# Step 1: keep only the 6 best-scoring indicator columns
# (scored with SelectKBest's default scoring function).
feature_selector = SelectKBest(k=6)
# Step 2: logistic regression fitted with L-BFGS; verbose=1 makes the fit
# emit progress messages.
classifier = LogisticRegression(solver='lbfgs', verbose=1)

# Chaining the steps means feature selection is re-fitted on whatever data
# the pipeline is fitted on, rather than once up front.
pipeline = Pipeline(steps=[
    ('selector', feature_selector),
    ('model', classifier),
])

In [9]:
# Stratified 5-fold split without shuffling.
# random_state is intentionally NOT passed: it only matters when
# shuffle=True, and scikit-learn >= 0.24 raises ValueError if a seed is given
# while shuffle is False. Leaving it out keeps the folds exactly as before.
cv = StratifiedKFold(n_splits=5)

# Fit and score the pipeline on each fold. Features are every column except
# the label; 'f1' is the binary F1 score, and return_train_score=True also
# records the score on the training part of each fold.
scores = cross_validate(
    pipeline,
    encoded_df.drop('political_party', axis=1).values,
    encoded_df.political_party.values,
    cv=cv,
    scoring='f1',
    return_train_score=True,
)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished

In [10]:
# Turn the dict of per-fold arrays returned by cross_validate into a table:
# one row per fold, one column per dict key.
scores = pd.DataFrame(scores)

In [11]:
# Show the per-fold cross-validation results table.
scores


Out[11]:
fit_time score_time test_score train_score
0 0.011379 0.000959 0.957746 0.941606
1 0.005397 0.000785 0.956522 0.942029
2 0.004702 0.000794 0.923077 0.938628
3 0.004542 0.000880 0.985075 0.935252
4 0.004067 0.000980 0.857143 0.952030