Homework 2 - Logistic Regression Dataset

This is a binary classification dataset (Parkinson's voice measurements, with a 0/1 `status` column) for practicing logistic regression.


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
# Read the raw table from disk
filename = 'log_reg/parkinsons.data'
dataset = pd.read_csv(filename)

# Shuffle every row (frac=1) with a fixed seed so the ordering is repeatable.
# reset_index() is called without drop=True, so the pre-shuffle row number is
# kept as a regular 'index' column.
shuffled = dataset.sample(frac=1, random_state=32)
dataset = shuffled.reset_index()
dataset.head()


Out[2]:
index name MDVP:Fo(Hz) MDVP:Fhi(Hz) MDVP:Flo(Hz) MDVP:Jitter(%) MDVP:Jitter(Abs) MDVP:RAP MDVP:PPQ Jitter:DDP ... Shimmer:DDA NHR HNR status RPDE DFA spread1 spread2 D2 PPE
0 27 phon_R01_S06_4 146.845 208.701 81.737 0.00496 0.000030 0.00250 0.00275 0.00749 ... 0.02650 0.01328 25.119 1 0.358773 0.726652 -6.271690 0.196102 2.314209 0.162999
1 141 phon_R01_S34_2 208.083 253.792 91.802 0.00757 0.000040 0.00428 0.00428 0.01285 ... 0.12047 0.04238 15.648 1 0.606344 0.665945 -5.410336 0.288917 2.665133 0.231723
2 140 phon_R01_S34_1 170.368 268.796 79.543 0.00571 0.000030 0.00232 0.00269 0.00696 ... 0.05139 0.02485 18.540 1 0.677131 0.685057 -4.796845 0.397749 2.963799 0.277227
3 34 phon_R01_S07_5 203.184 211.526 196.160 0.00178 0.000009 0.00094 0.00106 0.00283 ... 0.01403 0.00065 33.047 0 0.340068 0.741899 -7.964984 0.163519 1.423287 0.044539
4 57 phon_R01_S16_4 117.274 129.916 110.402 0.00752 0.000060 0.00299 0.00469 0.00898 ... 0.03568 0.00681 22.817 1 0.530529 0.817756 -4.608260 0.290024 2.021591 0.314464

5 rows × 25 columns


In [3]:
# Show all column labels; the truncated head() display above hides some of them
dataset.columns


Out[3]:
Index([u'index', u'name', u'MDVP:Fo(Hz)', u'MDVP:Fhi(Hz)', u'MDVP:Flo(Hz)',
       u'MDVP:Jitter(%)', u'MDVP:Jitter(Abs)', u'MDVP:RAP', u'MDVP:PPQ',
       u'Jitter:DDP', u'MDVP:Shimmer', u'MDVP:Shimmer(dB)', u'Shimmer:APQ3',
       u'Shimmer:APQ5', u'MDVP:APQ', u'Shimmer:DDA', u'NHR', u'HNR', u'status',
       u'RPDE', u'DFA', u'spread1', u'spread2', u'D2', u'PPE'],
      dtype='object')

In [4]:
# Remove the leftover pre-shuffle row number ('index') and the 'name'
# identifier column.
# The result is rebound instead of using inplace=True, and errors='ignore'
# skips labels that are already gone, so re-running this cell no longer raises
# once the columns have been removed.
dataset = dataset.drop(['index','name'], axis=1, errors='ignore')
dataset.head()


Out[4]:
MDVP:Fo(Hz) MDVP:Fhi(Hz) MDVP:Flo(Hz) MDVP:Jitter(%) MDVP:Jitter(Abs) MDVP:RAP MDVP:PPQ Jitter:DDP MDVP:Shimmer MDVP:Shimmer(dB) ... Shimmer:DDA NHR HNR status RPDE DFA spread1 spread2 D2 PPE
0 146.845 208.701 81.737 0.00496 0.000030 0.00250 0.00275 0.00749 0.01919 0.198 ... 0.02650 0.01328 25.119 1 0.358773 0.726652 -6.271690 0.196102 2.314209 0.162999
1 208.083 253.792 91.802 0.00757 0.000040 0.00428 0.00428 0.01285 0.06725 0.571 ... 0.12047 0.04238 15.648 1 0.606344 0.665945 -5.410336 0.288917 2.665133 0.231723
2 170.368 268.796 79.543 0.00571 0.000030 0.00232 0.00269 0.00696 0.03273 0.281 ... 0.05139 0.02485 18.540 1 0.677131 0.685057 -4.796845 0.397749 2.963799 0.277227
3 203.184 211.526 196.160 0.00178 0.000009 0.00094 0.00106 0.00283 0.00958 0.085 ... 0.01403 0.00065 33.047 0 0.340068 0.741899 -7.964984 0.163519 1.423287 0.044539
4 117.274 129.916 110.402 0.00752 0.000060 0.00299 0.00469 0.00898 0.02293 0.221 ... 0.03568 0.00681 22.817 1 0.530529 0.817756 -4.608260 0.290024 2.021591 0.314464

5 rows × 23 columns


In [5]:
# Split Into Training & Testing Sets (30% of the rows held out for testing).
# train_test_split shuffles again by default, so random_state is pinned here;
# without it the CSV files written below would change on every run.
train, test  = train_test_split(dataset, test_size=0.30, random_state=32)
#train, valid = train_test_split(train, test_size=0.20, random_state=32)

# Write to Text Data (index=False keeps the row index out of the files)
train.to_csv('log_reg/parkinsons_train.csv',index=False)
#valid.to_csv('log_reg/valid_parkinsons.csv',index=False)
test.to_csv('log_reg/parkinsons_test.csv',index=False)

In [6]:
# Report how many rows ended up in each split.
# print is called with a single parenthesised argument so the cell produces
# the same output on Python 2 and runs on Python 3 as well.
print('Number of Instances Per Set')
print('Training Set:   %d' % len(train))
#print('Validation Set: %d' % len(valid))
print('Testing Set:    %d' % len(test))


Number of Instances Per Set
Training Set:   136
Testing Set:    59