In [1]:

    
import pandas as pd
# Location of the labelled congressional-voting CSV (a relative path, so it is
# resolved against the notebook's working directory).
DATA_PATH = 'congressional_voting_scikit_label_dataset.csv'

# Load the whole file into a DataFrame; the later cells read from data_df.
data_df = pd.read_csv(DATA_PATH)



In [2]:

    
import numpy as np
# Seed NumPy's global random number generator so NumPy-driven randomness is
# repeatable from one run of the notebook to the next.
np.random.seed(12345)
# IPython magic: draw matplotlib figures inline in the cell output.
%matplotlib inline
# NOTE(review): seaborn is imported but not referenced in the cells visible here.
import seaborn as sns

import matplotlib.pyplot as plt
# Apply matplotlib's 'ggplot' style sheet to the figures created afterwards.
plt.style.use('ggplot')



In [3]:

    
# Preview the first rows of the raw data: one column per vote plus the
# political_party label column.
data_df.head()









    Out[3]:







  
    
      
      handicapped_infants
      water_project_cost_sharing
      adoption_of_the_budget_resolution
      physician_fee_freeze
      el_salvador_aid
      religious_groups_in_schools
      anti_satellite_test_ban
      aid_to_nicaraguan_contras
      mx_missile
      immigration
      synfuels_corporation_cutback
      education_spending
      superfund_right_to_sue
      crime
      duty_free_exports
      export_administration_act_south_africa
      political_party
    
  
  
    
      0
      1.0
      2.0
      1.0
      2.0
      2.0
      2.0
      1.0
      1.0
      1.0
      2.0
      0.0
      2.0
      2.0
      2.0
      1.0
      2.0
      1.0
    
    
      1
      1.0
      2.0
      1.0
      2.0
      2.0
      2.0
      1.0
      1.0
      1.0
      1.0
      1.0
      2.0
      2.0
      2.0
      1.0
      0.0
      1.0
    
    
      2
      0.0
      2.0
      2.0
      0.0
      2.0
      2.0
      1.0
      1.0
      1.0
      1.0
      2.0
      1.0
      2.0
      2.0
      1.0
      1.0
      0.0
    
    
      3
      1.0
      2.0
      2.0
      1.0
      0.0
      2.0
      1.0
      1.0
      1.0
      1.0
      2.0
      1.0
      2.0
      1.0
      1.0
      2.0
      0.0
    
    
      4
      2.0
      2.0
      2.0
      1.0
      2.0
      2.0
      1.0
      1.0
      1.0
      1.0
      2.0
      0.0
      2.0
      2.0
      2.0
      2.0
      0.0

Class Distribution



In [4]:

    
# Count how many rows carry each political_party label to check class balance.
counts = data_df['political_party'].value_counts()

# Bar chart of the class counts. The title and axis labels make the figure
# readable on its own when the notebook is skimmed.
ax = counts.plot(kind='bar')
ax.set(
    title='Class distribution',
    xlabel='political_party',
    ylabel='Number of rows',
);  # trailing semicolon suppresses the list repr returned by ax.set

Feature Counts



In [5]:

    
columns = data_df.columns
gp = data_df.groupby('political_party')

# Every column except the label is treated as a feature.
feature_names = [name for name in columns if name != 'political_party']

# For each feature, print how often each encoded value occurs within each
# political_party group.
for name in feature_names:
    print(gp[name].value_counts())









    



political_party  handicapped_infants
0.0              2.0                    156
                 1.0                    102
                 0.0                      9
1.0              1.0                    134
                 2.0                     31
                 0.0                      3
Name: handicapped_infants, dtype: int64
political_party  water_project_cost_sharing
0.0              2.0                           120
                 1.0                           119
                 0.0                            28
1.0              2.0                            75
                 1.0                            73
                 0.0                            20
Name: water_project_cost_sharing, dtype: int64
political_party  adoption_of_the_budget_resolution
0.0              2.0                                  231
                 1.0                                   29
                 0.0                                    7
1.0              1.0                                  142
                 2.0                                   22
                 0.0                                    4
Name: adoption_of_the_budget_resolution, dtype: int64
political_party  physician_fee_freeze
0.0              1.0                     245
                 2.0                      14
                 0.0                       8
1.0              2.0                     163
                 0.0                       3
                 1.0                       2
Name: physician_fee_freeze, dtype: int64
political_party  el_salvador_aid
0.0              1.0                200
                 2.0                 55
                 0.0                 12
1.0              2.0                157
                 1.0                  8
                 0.0                  3
Name: el_salvador_aid, dtype: int64
political_party  religious_groups_in_schools
0.0              1.0                            135
                 2.0                            123
                 0.0                              9
1.0              2.0                            149
                 1.0                             17
                 0.0                              2
Name: religious_groups_in_schools, dtype: int64
political_party  anti_satellite_test_ban
0.0              2.0                        200
                 1.0                         59
                 0.0                          8
1.0              1.0                        123
                 2.0                         39
                 0.0                          6
Name: anti_satellite_test_ban, dtype: int64
political_party  aid_to_nicaraguan_contras
0.0              2.0                          218
                 1.0                           45
                 0.0                            4
1.0              1.0                          133
                 2.0                           24
                 0.0                           11
Name: aid_to_nicaraguan_contras, dtype: int64
political_party  mx_missile
0.0              2.0           188
                 1.0            60
                 0.0            19
1.0              1.0           146
                 2.0            19
                 0.0             3
Name: mx_missile, dtype: int64
political_party  immigration
0.0              1.0            139
                 2.0            124
                 0.0              4
1.0              2.0             92
                 1.0             73
                 0.0              3
Name: immigration, dtype: int64
political_party  synfuels_corporation_cutback
0.0              2.0                             129
                 1.0                             126
                 0.0                              12
1.0              1.0                             138
                 2.0                              21
                 0.0                               9
Name: synfuels_corporation_cutback, dtype: int64
political_party  education_spending
0.0              1.0                   213
                 2.0                    36
                 0.0                    18
1.0              2.0                   135
                 1.0                    20
                 0.0                    13
Name: education_spending, dtype: int64
political_party  superfund_right_to_sue
0.0              1.0                       179
                 2.0                        73
                 0.0                        15
1.0              2.0                       136
                 1.0                        22
                 0.0                        10
Name: superfund_right_to_sue, dtype: int64
political_party  crime
0.0              1.0      167
                 2.0       90
                 0.0       10
1.0              2.0      158
                 0.0        7
                 1.0        3
Name: crime, dtype: int64
political_party  duty_free_exports
0.0              2.0                  160
                 1.0                   91
                 0.0                   16
1.0              1.0                  142
                 2.0                   14
                 0.0                   12
Name: duty_free_exports, dtype: int64
political_party  export_administration_act_south_africa
0.0              2.0                                       173
                 0.0                                        82
                 1.0                                        12
1.0              2.0                                        96
                 1.0                                        50
                 0.0                                        22
Name: export_administration_act_south_africa, dtype: int64

Convert to 1-hot encoding



In [6]:

    
# One-hot encode every feature column and keep the label column unchanged.
columns = data_df.columns
feature_columns = [column for column in columns if column != 'political_party']

# Collect one block of indicator columns per feature; a separate name is used
# for the list so that encoded_df only ever refers to the final DataFrame.
encoded_parts = []
for column in feature_columns:
    # dtype is pinned so the indicators are 0/1 integers regardless of the
    # pandas version (newer pandas releases default to bool indicators).
    cat_df = pd.get_dummies(data_df[column], dtype='uint8')
    # Name each indicator after the category value it actually represents
    # (e.g. 2.0 -> "<column>_2"). Numbering by position would mislabel the
    # indicators whenever a category is missing from a column.
    cat_df.columns = ['{}_{:g}'.format(column, value) for value in cat_df.columns]
    encoded_parts.append(cat_df)

# Stitch the indicator blocks together side by side, then append the label as
# the last column (aligned on the row index) without mutating in place.
encoded_df = pd.concat(encoded_parts, axis=1).assign(
    political_party=data_df['political_party']
)



In [7]:

    
# Preview the first rows of the one-hot encoded frame: indicator columns named
# "<feature>_<n>" followed by the political_party label.
encoded_df.head()









    Out[7]:







  
    
      
      handicapped_infants_0
      handicapped_infants_1
      handicapped_infants_2
      water_project_cost_sharing_0
      water_project_cost_sharing_1
      water_project_cost_sharing_2
      adoption_of_the_budget_resolution_0
      adoption_of_the_budget_resolution_1
      adoption_of_the_budget_resolution_2
      physician_fee_freeze_0
      ...
      crime_0
      crime_1
      crime_2
      duty_free_exports_0
      duty_free_exports_1
      duty_free_exports_2
      export_administration_act_south_africa_0
      export_administration_act_south_africa_1
      export_administration_act_south_africa_2
      political_party
    
  
  
    
      0
      0
      1
      0
      0
      0
      1
      0
      1
      0
      0
      ...
      0
      0
      1
      0
      1
      0
      0
      0
      1
      1.0
    
    
      1
      0
      1
      0
      0
      0
      1
      0
      1
      0
      0
      ...
      0
      0
      1
      0
      1
      0
      1
      0
      0
      1.0
    
    
      2
      1
      0
      0
      0
      0
      1
      0
      0
      1
      1
      ...
      0
      0
      1
      0
      1
      0
      0
      1
      0
      0.0
    
    
      3
      0
      1
      0
      0
      0
      1
      0
      0
      1
      0
      ...
      0
      1
      0
      0
      1
      0
      0
      0
      1
      0.0
    
    
      4
      0
      0
      1
      0
      0
      1
      0
      0
      1
      0
      ...
      0
      0
      1
      0
      0
      1
      0
      0
      1
      0.0
    
  

5 rows × 49 columns

Model and Cross-Validation



In [8]:

    
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression


# Two-stage estimator: feature selection followed by a classifier.
pipeline_steps = [
    # Keep only the 6 best-scoring input columns (SelectKBest's default
    # scoring function is used, since none is passed).
    ('selector', SelectKBest(k=6)),
    # Logistic regression fitted with the L-BFGS solver; verbose=1 makes the
    # fit print progress messages.
    ('model', LogisticRegression(solver='lbfgs', verbose=1)),
]
pipeline = Pipeline(steps=pipeline_steps)



In [9]:

    
# 5-fold stratified cross-validation.
# random_state is intentionally NOT passed: StratifiedKFold only uses it when
# shuffle=True, and newer scikit-learn releases raise a ValueError if it is
# set while shuffle is False. Without shuffling the folds are deterministic,
# so the run stays reproducible. If shuffled folds are wanted instead, pass
# shuffle=True together with random_state.
cv = StratifiedKFold(n_splits=5)

# Feature matrix: every encoded column except the label, as a NumPy array.
X = encoded_df.drop('political_party', axis=1).values
# Target vector: the political_party label.
y = encoded_df.political_party.values

# Fit and score the pipeline on each fold using the F1 score; train scores are
# kept as well so train and test performance can be compared per fold.
scores = cross_validate(
    pipeline,
    X,
    y,
    cv=cv,
    scoring='f1',
    return_train_score=True,
)









    



[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished



In [10]:

    
# Turn the per-fold results mapping returned by cross_validate into a table:
# one column per key, one row per fold.
scores = pd.DataFrame(scores)



In [11]:

    
# Display the per-fold table: fit_time, score_time, test_score and train_score.
scores









    Out[11]:







  
    
      
      fit_time
      score_time
      test_score
      train_score
    
  
  
    
      0
      0.011379
      0.000959
      0.957746
      0.941606
    
    
      1
      0.005397
      0.000785
      0.956522
      0.942029
    
    
      2
      0.004702
      0.000794
      0.923077
      0.938628
    
    
      3
      0.004542
      0.000880
      0.985075
      0.935252
    
    
      4
      0.004067
      0.000980
      0.857143
      0.952030

	handicapped_infants	water_project_cost_sharing	adoption_of_the_budget_resolution	physician_fee_freeze	el_salvador_aid	religious_groups_in_schools	anti_satellite_test_ban	aid_to_nicaraguan_contras	mx_missile	immigration	synfuels_corporation_cutback	education_spending	superfund_right_to_sue	crime	duty_free_exports	export_administration_act_south_africa	political_party
0	1.0	2.0	1.0	2.0	2.0	2.0	1.0	1.0	1.0	2.0	0.0	2.0	2.0	2.0	1.0	2.0	1.0
1	1.0	2.0	1.0	2.0	2.0	2.0	1.0	1.0	1.0	1.0	1.0	2.0	2.0	2.0	1.0	0.0	1.0
2	0.0	2.0	2.0	0.0	2.0	2.0	1.0	1.0	1.0	1.0	2.0	1.0	2.0	2.0	1.0	1.0	0.0
3	1.0	2.0	2.0	1.0	0.0	2.0	1.0	1.0	1.0	1.0	2.0	1.0	2.0	1.0	1.0	2.0	0.0
4	2.0	2.0	2.0	1.0	2.0	2.0	1.0	1.0	1.0	1.0	2.0	0.0	2.0	2.0	2.0	2.0	0.0

	handicapped_infants_0	handicapped_infants_1	handicapped_infants_2	water_project_cost_sharing_2	adoption_of_the_budget_resolution_1	adoption_of_the_budget_resolution_2	physician_fee_freeze_0	...	crime_1	crime_2	duty_free_exports_1	duty_free_exports_2	export_administration_act_south_africa_0	export_administration_act_south_africa_1	export_administration_act_south_africa_2	political_party
0	0	1	0	1	1	0	0	...	0	1	1	0	0	0	1	1.0
1	0	1	0	1	1	0	0	...	0	1	1	0	1	0	0	1.0
2	1	0	0	1	0	1	1	...	0	1	1	0	0	1	0	0.0
3	0	1	0	1	0	1	0	...	1	0	1	0	0	0	1	0.0
4	0	0	1	1	0	1	0	...	0	1	0	1	0	0	1	0.0

	fit_time	score_time	test_score	train_score
0	0.011379	0.000959	0.957746	0.941606
1	0.005397	0.000785	0.956522	0.942029
2	0.004702	0.000794	0.923077	0.938628
3	0.004542	0.000880	0.985075	0.935252
4	0.004067	0.000980	0.857143	0.952030

	handicapped_infants	water_project_cost_sharing	adoption_of_the_budget_resolution	physician_fee_freeze	el_salvador_aid	religious_groups_in_schools	anti_satellite_test_ban	aid_to_nicaraguan_contras	mx_missile	immigration	synfuels_corporation_cutback	education_spending	superfund_right_to_sue	crime	duty_free_exports	export_administration_act_south_africa	political_party
0	1.0	2.0	1.0	2.0	2.0	2.0	1.0	1.0	1.0	2.0	0.0	2.0	2.0	2.0	1.0	2.0	1.0
1	1.0	2.0	1.0	2.0	2.0	2.0	1.0	1.0	1.0	1.0	1.0	2.0	2.0	2.0	1.0	0.0	1.0
2	0.0	2.0	2.0	0.0	2.0	2.0	1.0	1.0	1.0	1.0	2.0	1.0	2.0	2.0	1.0	1.0	0.0
3	1.0	2.0	2.0	1.0	0.0	2.0	1.0	1.0	1.0	1.0	2.0	1.0	2.0	1.0	1.0	2.0	0.0
4	2.0	2.0	2.0	1.0	2.0	2.0	1.0	1.0	1.0	1.0	2.0	0.0	2.0	2.0	2.0	2.0	0.0

	handicapped_infants_0	handicapped_infants_1	handicapped_infants_2	water_project_cost_sharing_2	adoption_of_the_budget_resolution_1	adoption_of_the_budget_resolution_2	physician_fee_freeze_0	...	crime_1	crime_2	duty_free_exports_1	duty_free_exports_2	export_administration_act_south_africa_0	export_administration_act_south_africa_1	export_administration_act_south_africa_2	political_party
0	0	1	0	1	1	0	0	...	0	1	1	0	0	0	1	1.0
1	0	1	0	1	1	0	0	...	0	1	1	0	1	0	0	1.0
2	1	0	0	1	0	1	1	...	0	1	1	0	0	1	0	0.0
3	0	1	0	1	0	1	0	...	1	0	1	0	0	0	1	0.0
4	0	0	1	1	0	1	0	...	0	1	0	1	0	0	1	0.0

	handicapped_infants	water_project_cost_sharing	adoption_of_the_budget_resolution	physician_fee_freeze	el_salvador_aid	religious_groups_in_schools	anti_satellite_test_ban	aid_to_nicaraguan_contras	mx_missile	immigration	synfuels_corporation_cutback	education_spending	superfund_right_to_sue	crime	duty_free_exports	export_administration_act_south_africa	political_party
0	1.0	2.0	1.0	2.0	2.0	2.0	1.0	1.0	1.0	2.0	0.0	2.0	2.0	2.0	1.0	2.0	1.0
1	1.0	2.0	1.0	2.0	2.0	2.0	1.0	1.0	1.0	1.0	1.0	2.0	2.0	2.0	1.0	0.0	1.0
2	0.0	2.0	2.0	0.0	2.0	2.0	1.0	1.0	1.0	1.0	2.0	1.0	2.0	2.0	1.0	1.0	0.0
3	1.0	2.0	2.0	1.0	0.0	2.0	1.0	1.0	1.0	1.0	2.0	1.0	2.0	1.0	1.0	2.0	0.0
4	2.0	2.0	2.0	1.0	2.0	2.0	1.0	1.0	1.0	1.0	2.0	0.0	2.0	2.0	2.0	2.0	0.0

	handicapped_infants_0	handicapped_infants_1	handicapped_infants_2	water_project_cost_sharing_2	adoption_of_the_budget_resolution_1	adoption_of_the_budget_resolution_2	physician_fee_freeze_0	...	crime_1	crime_2	duty_free_exports_1	duty_free_exports_2	export_administration_act_south_africa_0	export_administration_act_south_africa_1	export_administration_act_south_africa_2	political_party
0	0	1	0	1	1	0	0	...	0	1	1	0	0	0	1	1.0
1	0	1	0	1	1	0	0	...	0	1	1	0	1	0	0	1.0
2	1	0	0	1	0	1	1	...	0	1	1	0	0	1	0	0.0
3	0	1	0	1	0	1	0	...	1	0	1	0	0	0	1	0.0
4	0	0	1	1	0	1	0	...	0	1	0	1	0	0	1	0.0