Party words!


In [1]:
import pandas as pd
import numpy as np

In [2]:
two_party_words = pd.read_csv("../data/two.csv")
two_party_words.head()


Out[2]:
Unnamed: 0 title_x firstname middlename lastname name_suffix nickname party_x state_x district_x ... ziegler zimbabwe zimmer zinc zion zoberman zone zones zoo zuni
0 0 Rep Neil NaN Abercrombie NaN NaN D HI 1 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 1 Rep Gary L. Ackerman NaN NaN D NY 5 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 2 Rep Robert B. Aderholt NaN NaN R AL 4 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 3 Sen Daniel Kahikina Akaka NaN NaN D HI Junior Seat ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 4 Sen Wayne A. Allard NaN NaN R CO Senior Seat ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 14450 columns


In [3]:
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from scipy import stats

#make it so that we only show first 4 decimals for floats
np.set_printoptions(precision=4,suppress=True)

# visualization
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
party_dummies = pd.get_dummies(two_party_words.party_x).astype(int)
party_dummies = party_dummies[["R"]]
party_dummies.head()
capitol_words = party_dummies.merge(two_party_words, right_index=True, left_index=True)

#del capitol_words['Unnamed: 0']
capitol_words.head()


Out[4]:
R Unnamed: 0 title_x firstname middlename lastname name_suffix nickname party_x state_x ... ziegler zimbabwe zimmer zinc zion zoberman zone zones zoo zuni
0 0 0 Rep Neil NaN Abercrombie NaN NaN D HI ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0 1 Rep Gary L. Ackerman NaN NaN D NY ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 1 2 Rep Robert B. Aderholt NaN NaN R AL ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0 3 Sen Daniel Kahikina Akaka NaN NaN D HI ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 1 4 Sen Wayne A. Allard NaN NaN R CO ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 14451 columns

Histogram for words

Most of my histograms will look crazy like this because I'm dealing with a sparse matrix, so most of these words are rarely mentioned. There is not one word that is consistently mentioned by more than 40% of the people as their top word.


In [5]:
sns.pairplot(capitol_words[["zimbabwe","zinc"]])


Out[5]:
<seaborn.axisgrid.PairGrid at 0x11a110950>

In [6]:
word_columns = capitol_words.columns[807:]
capitol_words[word_columns]
capitol_words.R.head()
X_words = capitol_words[word_columns]
y_words = capitol_words["R"]
X_train,X_test,y_train,y_test = train_test_split(X_words,y_words,test_size=0.4)

from sklearn.tree import DecisionTreeClassifier
words_tree = DecisionTreeClassifier(max_depth=3, random_state=1)
words_tree.fit(X_train, y_train)


Out[6]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1, splitter='best')

In [7]:
words_tree.feature_importances_
features = pd.DataFrame({'feature':word_columns, 'importance':words_tree.feature_importances_}).sort_values(by='importance',ascending=False)

In [8]:
features.head()


Out[8]:
feature importance
10506 requesting 0.392907
6575 iraq 0.317179
12183 taxes 0.157311
247 african 0.132603
0 ? 0.000000

In [9]:
print capitol_words.columns[15:]
capitol_words.crp_id
capitol_words.ix[:,:25].head()


Index([u'website_x', u'webform', u'congress_office', u'bioguide_id',
       u'votesmart_id', u'fec_id', u'govtrack_id', u'crp_id', u'twitter_id',
       u'congresspedia_url',
       ...
       u'ziegler', u'zimbabwe', u'zimmer', u'zinc', u'zion', u'zoberman',
       u'zone', u'zones', u'zoo', u'zuni'],
      dtype='object', length=14436)
Out[9]:
R Unnamed: 0 title_x firstname middlename lastname name_suffix nickname party_x state_x ... website_x webform congress_office bioguide_id votesmart_id fec_id govtrack_id crp_id twitter_id congresspedia_url
0 0 0 Rep Neil NaN Abercrombie NaN NaN D HI ... http://www.house.gov/abercrombie NaN NaN A000014 26827.0 H6HI01121 400001 N00007665 neilabercrombie http://www.opencongress.org/wiki/Neil_Abercrombie
1 0 1 Rep Gary L. Ackerman NaN NaN D NY ... http://ackerman.house.gov/ http://www.house.gov/writerep 2111 Rayburn House Office Building A000022 26970.0 H4NY07011 400003 N00001143 repgaryackerman http://www.opencongress.org/wiki/Gary_Ackerman
2 1 2 Rep Robert B. Aderholt NaN NaN R AL ... https://aderholt.house.gov http://aderholt.house.gov/email-me2/ 235 Cannon House Office Building A000055 441.0 H6AL04098 400004 N00003028 Robert_Aderholt http://www.opencongress.org/wiki/Robert_Aderholt
3 0 3 Sen Daniel Kahikina Akaka NaN NaN D HI ... http://akaka.senate.gov http://www.akaka.senate.gov/email-senator-akak... 141 Hart Senate Office Building A000069 53286.0 S0HI00084 300001 N00007653 NaN http://www.opencongress.org/wiki/Daniel_Akaka
4 1 4 Sen Wayne A. Allard NaN NaN R CO ... http://allard.senate.gov NaN NaN A000109 26783.0 S6CO00168 300003 N00009082 NaN http://www.opencongress.org/wiki/Wayne_Allard

5 rows × 25 columns

Find method:

A small mask method to make my life easier:


In [10]:
def my_mask(df,column,condition,value):
    # return the rows of df where df[column] <condition> value
    new_data = []
    if condition == "==":
        new_data = df[df[column] == value]
    elif condition == "<=":
        new_data = df[df[column] <= value]
    elif condition == "!=":
        new_data = df[df[column] != value]
    elif condition == ">=":
        new_data = df[df[column] >= value]
    elif condition == ">":
        new_data = df[df[column] > value]
    elif condition == "<":
        new_data = df[df[column] < value]
    else:
        print "arguments needed-column,condition,value-:"
    return new_data

In [11]:
my_first_mask = my_mask(capitol_words,"firstname","==","Neil")

In [12]:
def subset(df,column):
    # build a dictionary mapping each unique value in `column`
    # to the sub-DataFrame of rows holding that value
    subsets = {}
    subs = df[column].unique()
    for element in subs:
        subsets[element] = my_mask(df,column,"==",element)
    print "New available dictionary of dataframes is:\n subsets_of ",subs
    return subsets

In [13]:
states = subset(capitol_words,"state_x")
states['AK'].head()
parties = subset(capitol_words, "party_x")
parties['D'].head()


New available dictionary of dataframes is:
 subsets_of  ['HI' 'NY' 'AL' 'CO' 'NJ' 'ME' 'MO' 'TN' 'LA' 'PA' 'OH' 'FL' 'MI' 'NH' 'NC'
 'MD' 'TX' 'MT' 'CA' 'UT' 'AR' 'DE' 'NM' 'GA' 'OR' 'IA' 'VA' 'KS' 'KY' 'IN'
 'WV' 'WA' 'WI' 'NV' 'IL' 'SC' 'GU' 'OK' 'MN' 'WY' 'AK' 'MA' 'ND' 'CT' 'VI'
 'MS' 'ID' 'RI' 'AS' 'AZ' 'NE' 'PR' 'SD' 'VT' 'DC']
New available dictionary of dataframes is:
 subsets_of  ['D' 'R']
Out[13]:
R Unnamed: 0 title_x firstname middlename lastname name_suffix nickname party_x state_x ... ziegler zimbabwe zimmer zinc zion zoberman zone zones zoo zuni
0 0 0 Rep Neil NaN Abercrombie NaN NaN D HI ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0 1 Rep Gary L. Ackerman NaN NaN D NY ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0 3 Sen Daniel Kahikina Akaka NaN NaN D HI ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 0 5 Rep Robert E. Andrews NaN Rob D NJ ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
6 0 6 Rep Thomas H. Allen NaN Tom D ME ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 14451 columns


In [14]:
str(my_first_mask)

# experiment: dynamically creating global variables named after states
globals()[capitol_words.state_x.unique()[0]+"lala"] = {}
print capitol_words.state_x.unique()[1]
globals()['variable{}'.format(capitol_words.state_x.unique()[1])] = 0


NY

In [15]:
def clean_sparse_irrelevant(df):
    # drop float columns that sum to 0 (words never used in this subset)
    cols = df.columns
    deleted = 0
    for c in cols:
        x = df[c]
        if x.dtype == "float64":
            if x.sum() == 0:
                del df[c]
                deleted += 1
    print "DELETED:", deleted
    return df

In [16]:
clean_sparse_irrelevant(states['AK'])


DELETED: 14135
Out[16]:
R Unnamed: 0 title_x firstname middlename lastname name_suffix nickname party_x state_x ... veterans village villages water whaling wilderness wildlife wind yeas youth
82 0 86 Sen Mark NaN Begich NaN NaN D AK ... 0.051424 0.091710 0.124813 0.000000 0.000000 0.000000 0.074392 0.000000 0.000000 0.000000
472 1 559 Sen Lisa A. Murkowski NaN NaN R AK ... 0.000000 0.095305 0.129706 0.068622 0.000000 0.085929 0.077309 0.085929 0.000000 0.090092
628 1 750 Sen Ted F. Stevens NaN NaN R AK ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.089775 0.080769 0.000000 0.106864 0.000000
753 1 889 Rep Don E. Young NaN NaN R AK ... 0.000000 0.000000 0.000000 0.000000 0.124897 0.082743 0.074442 0.000000 0.000000 0.000000

4 rows × 316 columns


In [17]:
sns.pairplot(states['AK'][["veterans","wildlife","wilderness","villages"]])


Out[17]:
<seaborn.axisgrid.PairGrid at 0x142cef610>

In [18]:
word_columns = states['AK'].columns
word_columns[40:]


Out[18]:
Index([u'202', u'46', u'48', u'a.m.', u'absence', u'acres', u'agreed', u'ak',
       u'alaska', u'alaska's',
       ...
       u'veterans', u'village', u'villages', u'water', u'whaling',
       u'wilderness', u'wildlife', u'wind', u'yeas', u'youth'],
      dtype='object', length=276)

In [19]:
y = word_columns[35:][0]
x = "..."
def reporter(x):
    # return the float value of x, or "no" when it can't be parsed
    try:
        return float(x)
    except ValueError:
        return "no"
print reporter(x)
reporter(y)
def word_finder(words,start):
    # scan an iterable of column names and return the index of the
    # first one starting with "a", i.e. where the word columns begin
    for index, element in enumerate(words, start):
        if element[0]!="a":
            pass
        else:
            first = index
            break
    return first
x = word_columns[29:].tolist()
print word_finder(x,29)
word_columns[word_finder(x,29):]


no
43
Out[19]:
Index([u'a.m.', u'absence', u'acres', u'agreed', u'ak', u'alaska', u'alaska's',
       u'alaskan', u'alaskans', u'alcohol',
       ...
       u'veterans', u'village', u'villages', u'water', u'whaling',
       u'wilderness', u'wildlife', u'wind', u'yeas', u'youth'],
      dtype='object', length=273)

In [20]:
my_mask(capitol_words,"whaling",">",0)


Out[20]:
R Unnamed: 0 title_x firstname middlename lastname name_suffix nickname party_x state_x ... ziegler zimbabwe zimmer zinc zion zoberman zone zones zoo zuni
266 0 312 Rep Charles A. Gonzalez NaN Charlie D TX ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
560 0 672 Rep Nick J. Rahall II NaN D WV ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
753 1 889 Rep Don E. Young NaN NaN R AK ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3 rows × 14451 columns

Standardization

with the sklearn.preprocessing package

CENTERING SPARSE DATA... not!

Centering sparse data would destroy the sparseness structure of the data, but MaxAbsScaler and maxabs_scale were specifically designed for scaling sparse data, especially if the features are on different scales. scale and StandardScaler can accept scipy.sparse matrices as input, as long as centering is explicitly disabled (with_mean=False). More about this in the sklearn preprocessing docs.
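
A minimal sketch of that point on a toy scipy.sparse matrix (the data here is illustrative, not the capitol words set): zeros stay zeros, so the sparse structure survives the scaling.

from scipy import sparse
from sklearn.preprocessing import MaxAbsScaler

# toy CSR matrix: two features on very different scales
toy = sparse.csr_matrix([[0.0, 10.0], [0.5, 0.0], [1.0, 40.0]])

# MaxAbsScaler divides each column by its maximum absolute value,
# so zero entries stay zero and sparsity is preserved
scaled = MaxAbsScaler().fit_transform(toy)
print scaled.toarray()
# [[ 0.    0.25]
#  [ 0.5   0.  ]
#  [ 1.    1.  ]]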


In [21]:
alaska = states["AK"]
alaska


Out[21]:
R Unnamed: 0 title_x firstname middlename lastname name_suffix nickname party_x state_x ... veterans village villages water whaling wilderness wildlife wind yeas youth
82 0 86 Sen Mark NaN Begich NaN NaN D AK ... 0.051424 0.091710 0.124813 0.000000 0.000000 0.000000 0.074392 0.000000 0.000000 0.000000
472 1 559 Sen Lisa A. Murkowski NaN NaN R AK ... 0.000000 0.095305 0.129706 0.068622 0.000000 0.085929 0.077309 0.085929 0.000000 0.090092
628 1 750 Sen Ted F. Stevens NaN NaN R AK ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.089775 0.080769 0.000000 0.106864 0.000000
753 1 889 Rep Don E. Young NaN NaN R AK ... 0.000000 0.000000 0.000000 0.000000 0.124897 0.082743 0.074442 0.000000 0.000000 0.000000

4 rows × 316 columns

Small example on Alaska:

I will normalize one small subset of my data just to see what the results would be and how the values change.


In [22]:
from sklearn.preprocessing import maxabs_scale
print maxabs_scale(alaska.ix[:,43:], axis=0, copy=False)
alaska.ix[:,43:] = maxabs_scale(alaska.ix[:,43:], axis=0, copy=False)
print alaska.ix[:,43:].head()


[[ 0.921   0.      0.     ...,  0.      0.      0.    ]
 [ 0.      0.      0.9572 ...,  1.      0.      1.    ]
 [ 1.      1.      1.     ...,  0.      1.      0.    ]
 [ 0.      0.      0.9217 ...,  0.      0.      0.    ]]
/Library/Python/2.7/site-packages/pandas-0.18.0-py2.7-macosx-10.11-intel.egg/pandas/core/indexing.py:461: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
         a.m.  absence     acres  agreed        ak    alaska  alaska's  \
82   0.921049      0.0  0.000000     0.0  0.962272  0.921049  0.921049   
472  0.000000      0.0  0.957161     0.0  1.000000  0.957161  0.957161   
628  1.000000      1.0  1.000000     1.0  0.000000  1.000000  1.000000   
753  0.000000      0.0  0.921669     0.0  0.962920  0.921669  0.921669   

      alaskan  alaskans  alcohol  ...    veterans   village  villages  water  \
82   0.921049  0.921049      0.0  ...         1.0  0.962272  0.962272    0.0   
472  0.957161  0.957161      1.0  ...         0.0  1.000000  1.000000    1.0   
628  1.000000  1.000000      0.0  ...         0.0  0.000000  0.000000    0.0   
753  0.921669  0.921669      0.0  ...         0.0  0.000000  0.000000    0.0   

     whaling  wilderness  wildlife  wind  yeas  youth  
82       0.0    0.000000  0.921049   0.0   0.0    0.0  
472      0.0    0.957161  0.957161   1.0   0.0    1.0  
628      0.0    1.000000  1.000000   0.0   1.0    0.0  
753      1.0    0.921669  0.921669   0.0   0.0    0.0  

[4 rows x 273 columns]

Now the data is scaled without losing the sparse structure:


In [23]:
alaska


Out[23]:
R Unnamed: 0 title_x firstname middlename lastname name_suffix nickname party_x state_x ... veterans village villages water whaling wilderness wildlife wind yeas youth
82 0 86 Sen Mark NaN Begich NaN NaN D AK ... 1.0 0.962272 0.962272 0.0 0.0 0.000000 0.921049 0.0 0.0 0.0
472 1 559 Sen Lisa A. Murkowski NaN NaN R AK ... 0.0 1.000000 1.000000 1.0 0.0 0.957161 0.957161 1.0 0.0 1.0
628 1 750 Sen Ted F. Stevens NaN NaN R AK ... 0.0 0.000000 0.000000 0.0 0.0 1.000000 1.000000 0.0 1.0 0.0
753 1 889 Rep Don E. Young NaN NaN R AK ... 0.0 0.000000 0.000000 0.0 1.0 0.921669 0.921669 0.0 0.0 0.0

4 rows × 316 columns

Word normalization with a small method:

I decided to build a small method for this. It doesn't have any game-changing logic to it; I just want to save myself some lines of code. What this method does is select the tail of the DataFrame, from where the words sparse matrix starts until the end, and scale it.


In [24]:
#select the word columns from the data frame: the words are the last part of it
#word_finder gives the index where the data frame becomes the sparse matrix
def word_maxabsscaler(dataFrame,index):
    dataFrame.ix[:,word_finder(dataFrame,index):] = maxabs_scale(dataFrame.ix[:,word_finder(dataFrame,index):], axis=0, copy=False)

In [25]:
NY = states["NY"]#.ix[:,30:]
NY.head()


Out[25]:
R Unnamed: 0 title_x firstname middlename lastname name_suffix nickname party_x state_x ... ziegler zimbabwe zimmer zinc zion zoberman zone zones zoo zuni
1 0 1 Rep Gary L. Ackerman NaN NaN D NY ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
11 0 11 Rep Michael A. Arcuri NaN NaN D NY ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
61 0 65 Rep Timothy H. Bishop NaN NaN D NY ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
93 1 97 Rep Ann Marie NaN Buerkle NaN NaN R NY ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
124 0 146 Rep Joseph NaN Crowley NaN Joe D NY ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 14451 columns


In [26]:
clean_sparse_irrelevant(NY)
word_maxabsscaler(NY,30)


DELETED: 11764

In [27]:
word_maxabsscaler(capitol_words,30)
clean_sparse_irrelevant(capitol_words)
capitol_words.head()


DELETED: 55
Out[27]:
R Unnamed: 0 title_x firstname middlename lastname name_suffix nickname party_x state_x ... ziegler zimbabwe zimmer zinc zion zoberman zone zones zoo zuni
0 0 0 Rep Neil NaN Abercrombie NaN NaN D HI ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0 1 Rep Gary L. Ackerman NaN NaN D NY ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 1 2 Rep Robert B. Aderholt NaN NaN R AL ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0 3 Sen Daniel Kahikina Akaka NaN NaN D HI ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 1 4 Sen Wayne A. Allard NaN NaN R CO ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 14396 columns


In [28]:
print "Commented method crashed when the dictionary is too long"
# def subset_sparse_reg(frameDict,  ):
#     for key in states:
#         clean_sparse_irrelevant(frameDict[key])
#         word_maxabsscaler(frameDict[key],30)
#         print key
#         try:
#             frameDict[key]["villages"][0]
#         except KeyError:
#             print "no villages in ", key
# subset_sparse_reg(states)


Commented method crashed when the dictionary is too long

In [29]:
len(states)


Out[29]:
55

Feature Decomposition and Dimensionality Reduction

Just to compare results between these models on this particular data set. This is an example of when it's a good idea to reduce the number of columns in the data set. There are more than 14,000 columns (the result of taking the most-said words as dummies and then computing their tf-idf counts), so too many columns are being used to predict the target variable, Republican or Democrat. One of the risks of these techniques is overfitting the model.


In [30]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler
from sklearn.linear_model import Lasso, Ridge, LinearRegression, LogisticRegression, ElasticNet
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.cross_validation import cross_val_score, train_test_split
import scipy.stats as stats

# visualization
%matplotlib inline
import seaborn as sns

In [31]:
capitol_words.head()


Out[31]:
R Unnamed: 0 title_x firstname middlename lastname name_suffix nickname party_x state_x ... ziegler zimbabwe zimmer zinc zion zoberman zone zones zoo zuni
0 0 0 Rep Neil NaN Abercrombie NaN NaN D HI ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0 1 Rep Gary L. Ackerman NaN NaN D NY ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 1 2 Rep Robert B. Aderholt NaN NaN R AL ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0 3 Sen Daniel Kahikina Akaka NaN NaN D HI ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 1 4 Sen Wayne A. Allard NaN NaN R CO ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 14396 columns

Where words start being useful

After normalization, some words had a global weight that was very small in the tf-idf matrix, so their column.sum() was zero. I will not feed those to my model, because a column filled with 0 will not add much variance in a sparse matrix. Also, index 30 is where the sparse matrix got attached to the original data set.


In [32]:
print "where the words start with a column sum that is different to 0, some words are said so little that their td-idf count could be 0, index position", word_finder(capitol_words,30)
capitol_words.ix[:,836:].head()


Index where the word columns start having a nonzero sum (some words are said so little that their tf-idf count is 0): 837
Out[32]:
aberdeen abernathy abilene abilities ability abilityone abington able able-bodied abm ... ziegler zimbabwe zimmer zinc zion zoberman zone zones zoo zuni
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 13560 columns


In [33]:
global_correlations = capitol_words.ix[:,836:].corr()
global_correlations.head()


Out[33]:
aberdeen abernathy abilene abilities ability abilityone abington able able-bodied abm ... ziegler zimbabwe zimmer zinc zion zoberman zone zones zoo zuni
aberdeen 1.000000 -0.002277 -0.002277 -0.002277 -0.002277 -0.003948 -0.002277 -0.006016 -0.002277 -0.002277 ... -0.002277 -0.003223 -0.002277 -0.002277 -0.002277 -0.003222 -0.003211 -0.002277 -0.003221 -0.002277
abernathy -0.002277 1.000000 -0.001318 -0.001318 -0.001318 -0.002284 -0.001318 -0.003480 -0.001318 -0.001318 ... -0.001318 -0.001864 -0.001318 -0.001318 -0.001318 -0.001864 -0.001858 -0.001318 -0.001864 -0.001318
abilene -0.002277 -0.001318 1.000000 -0.001318 -0.001318 -0.002284 -0.001318 -0.003480 -0.001318 -0.001318 ... -0.001318 -0.001864 -0.001318 -0.001318 -0.001318 -0.001864 -0.001858 -0.001318 -0.001864 -0.001318
abilities -0.002277 -0.001318 -0.001318 1.000000 -0.001318 -0.002284 -0.001318 -0.003480 -0.001318 -0.001318 ... -0.001318 -0.001864 -0.001318 -0.001318 -0.001318 -0.001864 -0.001858 -0.001318 -0.001864 -0.001318
ability -0.002277 -0.001318 -0.001318 -0.001318 1.000000 -0.002284 -0.001318 0.447158 -0.001318 -0.001318 ... -0.001318 -0.001864 -0.001318 -0.001318 -0.001318 -0.001864 -0.001858 -0.001318 -0.001864 -0.001318

5 rows × 13560 columns


In [34]:
sns.plt.figure(figsize=(24,20))
sns.heatmap(capitol_words.ix[:,836:].transpose().corr().values)


Out[34]:
<matplotlib.axes._subplots.AxesSubplot at 0x134fb2c50>

We can see that some variables are very correlated. This will cause a lot of trouble for the PCA: some words are highly correlated with one another, some because they are synonyms, others because they are used in very similar contexts. So far I expect it to return an ill-conditioned matrix of principal components. This is done only to have a look at the columns and get some kind of description of them.


In [35]:
pca = PCA()
transformed_pca_x = pca.fit_transform(capitol_words.ix[:,836:])

component_names = ["component_"+str(comp) for comp in range(1, len(pca.explained_variance_)+1)]

#generate new component dataframe
transformed_pca_x = pd.DataFrame(transformed_pca_x,columns=component_names)
print "CCOMPONENT MATRIX:"
transformed_pca_x.head()


COMPONENT MATRIX:
Out[35]:
component_1 component_2 component_3 component_4 component_5 component_6 component_7 component_8 component_9 component_10 ... component_751 component_752 component_753 component_754 component_755 component_756 component_757 component_758 component_759 component_760
0 -0.220044 -0.449690 0.100950 -0.832581 0.219606 -0.467335 0.956750 0.545296 -0.340359 0.119844 ... 0.113568 -0.006057 0.008048 -0.122832 0.109533 -0.053620 0.052741 -0.047402 0.015401 1.515012e-14
1 0.138437 -1.602499 1.161407 -1.129002 -2.914940 0.627202 1.168848 0.004197 2.617375 -0.281435 ... -0.284056 0.077545 0.038633 0.064460 0.108532 -0.164170 -0.016991 -0.094365 0.011284 1.515012e-14
2 -0.867362 -0.680678 -0.810129 -0.662719 0.040655 -0.542018 -0.225643 0.616752 -0.661218 -0.966489 ... 0.083775 0.066818 -0.059395 -0.055435 0.075082 0.000091 0.005481 -0.035931 -0.012672 1.515012e-14
3 1.314171 0.209269 0.522750 -1.274981 1.279825 0.816029 0.246104 0.276451 -0.277542 0.198625 ... 0.091743 -0.084369 0.061380 -0.245269 0.239200 0.019961 -0.087226 -0.034727 0.002394 1.515012e-14
4 2.537292 1.965719 -0.582027 -2.657319 -0.062524 -0.268063 0.407101 0.573592 0.360400 0.506837 ... -0.000553 0.154048 0.180420 -0.328523 -0.074689 0.193437 0.066049 0.004390 0.016395 1.515012e-14

5 rows × 760 columns


In [36]:
#generate component loadings on original features
component_matrix = pd.DataFrame(pca.components_,index=component_names)

Too slow:

The following plots are too slow to run, and most likely they would be impossible to read. This would be useful code if there were fewer features.


In [37]:
#add additional columns to describe what
# component_matrix["explained_variance_ratio"] = pca.explained_variance_ratio_
#component_matrix["eigenvalue"] = pca.explained_variance_
# figure = sns.plt.figure(figsize=(18,6))

In [38]:
#add 3 subplots one at a time

#first the component matrix
# figure.add_subplot(131)
# sns.heatmap(component_matrix.ix[:,:-2])

# #then the eigenvalues
# figure.add_subplot(132)
# sns.plt.plot(range(1,component_matrix.shape[0]+1), component_matrix.eigenvalue)
# sns.plt.xlabel("component number")
# sns.plt.ylabel("variance explained")

# #then the explained variance ratio
# figure.add_subplot(133)
# sns.plt.plot(range(1,component_matrix.shape[0]+1), component_matrix.explained_variance_ratio)
# sns.plt.xlabel("component number")
# sns.plt.ylabel("eigenvalue")

Component Matrix:

The result is not the easiest to interpret.

The problem with this is that PCA expects features with little to no correlation. In this case, with words, if I were to build a model based on eliminating similar or correlated words, it would only accomplish the task of being overfitted and would not do well at all at predicting a real example. Let's say one of the components was based on the word "small", and "small" is correlated with "little", but I just deleted "little". Unless I have another way to capture semantic similarity, I can't get rid of those words just yet.


In [39]:
component_matrix.head()


Out[39]:
0 1 2 3 4 5 6 7 8 9 ... 13550 13551 13552 13553 13554 13555 13556 13557 13558 13559
component_1 0.000083 -0.000145 0.000077 -0.001005 -0.000916 -0.002106 0.000636 0.003440 -0.002305 0.001842 ... -0.000971 0.000044 -0.000971 -0.000554 -0.000541 -0.000807 0.000856 0.000722 -0.001109 -0.000554
component_2 -0.000089 -0.001345 -0.000270 -0.000259 0.000031 -0.001987 -0.000477 0.003825 0.002468 0.001834 ... -0.000574 -0.003957 -0.000574 -0.000301 -0.000934 -0.002164 -0.000004 0.000885 -0.002601 -0.000512
component_3 -0.000134 0.001348 -0.001406 -0.000852 -0.000445 -0.002034 0.000626 -0.000247 0.001749 -0.000668 ... -0.001044 0.003421 -0.001044 -0.000885 -0.000143 -0.000165 -0.001209 -0.000763 0.000741 -0.000859
component_4 -0.000015 0.000519 0.001321 0.000276 -0.000438 0.000543 0.001994 0.003334 -0.000366 -0.003675 ... -0.000069 -0.002044 -0.000069 0.000547 -0.001042 -0.000828 -0.001653 0.001370 0.000042 -0.001903
component_5 0.001052 -0.000505 -0.001242 0.000327 0.000413 0.001026 -0.000658 0.002251 -0.000059 0.003534 ... 0.000134 -0.007154 0.000134 -0.001067 -0.000988 -0.001702 -0.001243 0.000470 0.001377 0.002673

5 rows × 13560 columns

Logistic Regression

First try logistic regression, and compare against the failed PCA components.


In [158]:
X = transformed_pca_x.ix[:,:500]
y = capitol_words["R"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)

lr = LogisticRegression(C=1e9, penalty='l1')
lr.fit(X_train,y_train)
y_test_pred = lr.predict(X_test)

print "Test set accuracy of LR model: ",metrics.accuracy_score(y_test, y_test_pred)


Test set accuracy of LR model:  0.69298245614

Conclusion: Low benefit

Now with all of the features.


In [52]:
from sklearn.cross_validation import KFold, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn import metrics
import scipy.stats as stats

In [118]:
X = capitol_words[capitol_words.columns.tolist()[836:]]
y = capitol_words["R"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)

lr = LogisticRegression(C=1e9, penalty='l2')
lr.fit(X_train,y_train)
y_test_pred = lr.predict(X_test)

print "Test set accuracy of LR model: ",metrics.accuracy_score(y_test, y_test_pred)


Test set accuracy of LR model:  0.894736842105

This is the accuracy the model got on data it had never seen before.


In [119]:
# null accuracy?
# compute null accuracy manually
print "Null accuracy on the test set: ",y_test.mean()


Null accuracy on the test set:  0.548245614035

This is the fraction of samples in the test set that belong to the most frequent class. Even with multiple categories you'd still look at the counts of the categories and pick the one that is most frequent. In this case the data set is pretty balanced, because there are almost as many Republicans as Democrats, so this regression is capturing real information: it clearly beats the null accuracy. The null accuracy is what a dummy classifier would get by always picking whichever category is the most over-represented. This baseline is also useful for a multiclass problem.


In [120]:
from sklearn.dummy import DummyClassifier
dumb_model = DummyClassifier(strategy='most_frequent')
dumb_model.fit(X_train, y_train)
y_dumb_class = dumb_model.predict(X_test)
print "Most frequent class dummy classifier test accuracy: ",metrics.accuracy_score(y_test, y_dumb_class)


Most frequent class dummy classifier test accuracy:  0.548245614035

Cross val score

We would have to see if the means are statistically different. As long as the standard deviation is not crazy huge we can say something; if it is, we might not be able to say much. You want the green bars to be on the right and the gray ones on the left, with little overlap, so that on average across many folds the real model stays well above the dummy. The cross-validated scores vary per fold.


In [122]:
dumb_model = DummyClassifier(strategy='most_frequent')
dummy_scores = cross_val_score(dumb_model, X, y, cv=30)
real_scores = cross_val_score(LogisticRegression(),X , y,cv=30)
sns.plt.hist(dummy_scores)
sns.plt.hist(real_scores)
#we could use cv=StratifiedKFold for when the classes are really unbalanced
#real_scores = cross_val_score(LogisticRegression(),X , y,cv=30)
print np.mean(dummy_scores)
print np.mean(real_scores)
print np.std(real_scores)


0.517185185185
0.908360873694
0.0563920964819
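
The paragraph above suggests checking whether the fold means are statistically different; here is a small sketch of that check, reusing the dummy_scores and real_scores arrays just computed (equal_var=False gives Welch's t-test, which doesn't assume equal variances):

from scipy import stats

# two-sample t-test on the per-fold scores; a tiny p-value means
# the real and dummy fold means genuinely differ
t_stat, p_value = stats.ttest_ind(real_scores, dummy_scores, equal_var=False)
print "t-statistic:", t_stat
print "p-value:", p_value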

We don't know what we are classifying wrong. Accuracy is difficult to interpret when you have unbalanced classes; it really depends on which categories you are capturing right and which ones you are capturing badly, especially the under-represented categories. In terms of classification, we can decompose the errors and the hits in a confusion matrix.

Positives and Negatives

True positives, true negatives... and bad things happen too: false negatives and false positives. If I have a classifier it will make errors, and each error falls into either false negative or false positive. A false negative can be awful (like missing that someone has cancer); it's called a miss, and its cost depends on the context. A false positive is a false alarm. So there are multiple ways of talking about it: hit, correct rejection, miss, false alarm. What we want to do is minimize the misclassifications; what accuracy tells you is just the fraction of hits in the matrix. Maybe you want to trade one kind of error for another (in an unbalanced case like cancer screening, maybe missing sick people is the worst outcome, or maybe we want more positives so we capture the ones that really are positive). What is left to do is tune the threshold of the classifier. This is a measure of how accurate the model is; the true positive rate in this case turns out to be higher than the true negative rate.

Confusion Matrix

False negative rates can have a secondary effect on the accuracy; I haven't decided which rate I want to optimize. The diagonal holds the correctly predicted class 0 and class 1 counts. In this case it's only a 2x2 matrix, and the diagonal is what I want to maximize, so that the mispredicted class 0 and class 1 counts stay small. There is no obvious visual difference in the confusion matrix.


In [123]:
# confusion matrix
cm = metrics.confusion_matrix(y_test, y_test_pred)
print cm
sns.heatmap(cm)


[[ 89  14]
 [ 10 115]]
Out[123]:
<matplotlib.axes._subplots.AxesSubplot at 0x1326c0690>

What fraction of the positive labels in unseen data did we correctly call positive? That is the TPR (sensitivity). Specificity is the true negative rate: what fraction of the class 0s did we get right. These metrics are run on the test set; what I'd really want to do is compute them across all of the cross-validation folds and take the average over the unseen data. TODO! Precision (PPV) asks: when the model says Republican, how many times was it actually a Republican? It's a metric of how much you can trust the model. The true positive rate is pretty good: I captured 92% of the actual Republicans, so recall is well above chance. The PPV is 89%; it's the believability of the classifier: when it says Republican, 89% of the time it actually is a Republican. Precision is almost on the order of 90%, so I can be fairly certain it is Republican text.

The two numbers that matter most are precision and recall: precision is how believable the classifier is, recall is how much of the real positives it captures. The F1 score combines precision and recall.


In [124]:
# calculate each metric by hand
print "Sensitivity/Recall (TPR): ",cm[1,1] / float(cm[1,1] + cm[1,0])
print "Specificity (TNR): ", cm[0,0] / float(cm[0,0] + cm[0,1])
print "Precision (PPV): ", cm[1,1] / float(cm[1,1]+cm[0,1])
print "NPV: ", cm[0,0] / float(cm[0,0]+cm[1,0])
print "Accuracy: ", (cm[1,1]+cm[0,0]) / float(cm.sum())
print "F1:", metrics.f1_score(y_test,y_test_pred)


Sensitivity/Recall (TPR):  0.92
Specificity (TNR):  0.864077669903
Precision (PPV):  0.891472868217
NPV:  0.89898989899
Accuracy:  0.894736842105
F1: 0.905511811024
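
To make the F1 relationship concrete: it is the harmonic mean of precision and recall, and plugging in the precision and recall printed above reproduces sklearn's number.

# F1 = 2 * P * R / (P + R), the harmonic mean of precision and recall
precision, recall = 0.891472868217, 0.92
print "F1 by hand:", 2 * precision * recall / (precision + recall)
# F1 by hand: 0.905511811024  (matches metrics.f1_score above)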

In [125]:
# calculate some of these metrics using sklearn and the test set samples
print "Sensitivity/Recall (TPR): ",metrics.recall_score(y_test,y_test_pred)
print "Precision (PPV): ", metrics.precision_score(y_test,y_test_pred)
print "Accuracy: ", metrics.accuracy_score(y_test,y_test_pred)
print "F1:", metrics.f1_score(y_test,y_test_pred)


Sensitivity/Recall (TPR):  0.92
Precision (PPV):  0.891472868217
Accuracy:  0.894736842105
F1: 0.905511811024

In [126]:
print "Classification Report:\n", metrics.classification_report(y_test,y_test_pred)


Classification Report:
             precision    recall  f1-score   support

          0       0.90      0.86      0.88       103
          1       0.89      0.92      0.91       125

avg / total       0.89      0.89      0.89       228

The classifier sets the threshold at .5, so any probability higher than .5 gets the positive label by default. If we alter the threshold we get a different precision. Out of the box it just uses .5, but we can systematically look at what happens as we vary the threshold across every possible level.


In [128]:
#lr probabilities per category for first five samples
predicted_probs_lr = lr.predict_proba(X_test).round(3)
predictions_lr = lr.predict(X_test)

print "Logistic Regression predicted probabilities for first five samples in test set:\n",predicted_probs_lr[:5]
print "Logistic Regression predictions for first five samples in test set:\n",predictions_lr[:5]
y_test_lr_df = pd.DataFrame(
    np.concatenate((
        predicted_probs_lr,predictions_lr.reshape((predictions_lr.shape[0],-1)),
        y_test.reshape((y_test.shape[0],-1))),axis=1
    ),
    columns = ["class_0","class_1","predicted","actual"])

y_test_lr_df.head()


Logistic Regression predicted probabilities for first five samples in test set:
[[ 0.236  0.764]
 [ 0.935  0.065]
 [ 0.041  0.959]
 [ 0.956  0.044]
 [ 0.894  0.106]]
Logistic Regression predictions for first five samples in test set:
[1 0 1 0 0]
Out[128]:
class_0 class_1 predicted actual
0 0.236 0.764 1.0 0.0
1 0.935 0.065 0.0 0.0
2 0.041 0.959 1.0 1.0
3 0.956 0.044 0.0 0.0
4 0.894 0.106 0.0 0.0
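
Before looking at individual cases, here is a rough sketch of that threshold sweep, reusing the class-1 probabilities just computed (the threshold values are arbitrary sample points):

# sweep the class-1 decision threshold and watch precision/recall trade off
for thresh in [0.3, 0.5, 0.7, 0.9]:
    y_pred_t = (predicted_probs_lr[:, 1] > thresh).astype(int)
    print "threshold %.1f  precision %.3f  recall %.3f" % (
        thresh,
        metrics.precision_score(y_test, y_pred_t),
        metrics.recall_score(y_test, y_pred_t))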

There are cases I still need to investigate: in some of them the misclassification is not a borderline probability near chance; the model is confidently wrong. Those cases need to be flagged and manually reviewed; the bulk of the predictions could still be correct, but these ones need review. We can look at which of them are in the test set and isolate the examples:

There could be some other dimension that we are not capturing, these could be mislabeled, or this person just talks like a Republican... not sure yet.


In [133]:
bad_y_class_0 = y_test_lr_df[np.logical_and(y_test_lr_df.class_0>.9, y_test_lr_df.actual==1.0)]
print bad_y_class_0
bad_y_class_1 = y_test_lr_df[np.logical_and(y_test_lr_df.class_1>.9, y_test_lr_df.actual==0.0)]
print bad_y_class_1


     class_0  class_1  predicted  actual
15     0.971    0.029        0.0     1.0
40     0.993    0.007        0.0     1.0
43     0.943    0.057        0.0     1.0
122    0.908    0.092        0.0     1.0
     class_0  class_1  predicted  actual
80     0.049    0.951        1.0     0.0
124    0.040    0.960        1.0     0.0
131    0.053    0.947        1.0     0.0
189    0.092    0.908        1.0     0.0

Random Forest


In [78]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train,y_train)

predicted_probs_rf = rf.predict_proba(X_test)
predictions_rf = rf.predict(X_test)

y_test_rf_df = pd.DataFrame(
    np.concatenate((
        predicted_probs_rf,predictions_rf.reshape((predictions_rf.shape[0],-1)),
        y_test.reshape((y_test.shape[0],-1))),axis=1
    ),
    columns = ["class_0","class_1","predicted","actual"])

y_test_rf_df.head()


Out[78]:
class_0 class_1 predicted actual
0 0.32 0.68 1.0 0.0
1 0.62 0.38 0.0 0.0
2 0.62 0.38 0.0 1.0
3 0.56 0.44 0.0 0.0
4 0.62 0.38 0.0 0.0

Class probability changes. The random forest did not help; it's even more confident in the opposite, wrong cases. If you want next to no false positives, the threshold would be around .78. Now for the two curves:

An ROC curve illustrates the performance of the binary classifier as the threshold is systematically changed.


In [134]:
#generate lr model false positive and true positive rates
fpr_lr, tpr_lr, thresholds_lr = metrics.roc_curve(y_test, predicted_probs_lr[:,1])

#generate same for random forest model
fpr_rf, tpr_rf, thresholds_rf = metrics.roc_curve(y_test, predicted_probs_rf[:,1])

# plot LR and RF model ROC curves
sns.plt.plot(fpr_lr, tpr_lr,label="lr")
sns.plt.plot(fpr_rf, tpr_rf,label="rf")
sns.plt.xlim([0, 1])
sns.plt.ylim([0, 1.05])
sns.plt.legend(loc="lower right")
sns.plt.xlabel('False Positive Rate (1 - Specificity)')
sns.plt.ylabel('True Positive Rate (Sensitivity)')


Out[134]:
<matplotlib.text.Text at 0x132999b10>

The area under the curve is the AUC. It tells you how good your classifier is regardless of the threshold, so here logistic regression is better. If the classifier were bad, the curve would be the diagonal line: a coin flip. It's also a way to compare classifiers against each other, so in the context of this problem, with the parameters that I used, the logistic regression is actually better than the random forest at all false positive rates. The AUC is very high, and logistic regression consistently outperforms the random forest.


In [135]:
# calculate AUC for lr and rf
print "LR model AUC: ",metrics.roc_auc_score(y_test, predicted_probs_lr[:,1])
print "RF model AUC: ",metrics.roc_auc_score(y_test, predicted_probs_rf[:,1])


LR model AUC:  0.966019417476
RF model AUC:  0.904776699029

Probabilities are green and the ROC is blue; on the x axis in both cases we have the false positive rate, and the y axis is the true positive rate (or the class 1 threshold). There are not many sudden drops, the curve looks very smooth, and there are not many gaps in the data.


In [136]:
# plot LR and RF model ROC curves
sns.plt.plot(fpr_lr, tpr_lr,label="lr")
sns.plt.plot(fpr_lr,thresholds_lr, label="lr_thresh")
sns.plt.xlim([0, 1])
sns.plt.ylim([0, 1.05])
sns.plt.legend(loc="center")
sns.plt.xlabel('False Positive Rate (1 - Specificity)')
sns.plt.ylabel('True Positive Rate (Sensitivity) or Class 1 Threshold Probability')


Out[136]:
<matplotlib.text.Text at 0x132a75790>

At a false positive rate of 12% we would still have a classifier that is almost as good; the threshold needs to be a little over .7. It's an exchange of a little accuracy for significantly fewer false positives, and the choice should be based on what I decide to care about later. Note that roc_curve has to be given the probabilities, not the class labels.


In [144]:
y_test_lr_df["predicted_075"] = (y_test_lr_df.class_1 > 0.72).astype(float)
print y_test_lr_df.head()
print "Confusion matrix at original 0.5 threshold:\n",metrics.confusion_matrix(y_test_lr_df.actual,
                                                                      y_test_lr_df.predicted),"\n"
print "Classification Report at original 0.5 threshold:\n", metrics.classification_report(y_test_lr_df.actual,
                                                                                          y_test_lr_df.predicted),"\n"
print "Confusion matrix at 0.72 threshold:\n",metrics.confusion_matrix(y_test_lr_df.actual,
                                                                      y_test_lr_df.predicted_07),"\n"
print "Classification Report at 0.72 threshold:\n", metrics.classification_report(y_test_lr_df.actual,
                                                                                 y_test_lr_df.predicted_07)


   class_0  class_1  predicted  actual  predicted_072
0    0.236    0.764        1.0     0.0            1.0
1    0.935    0.065        0.0     0.0            0.0
2    0.041    0.959        1.0     1.0            1.0
3    0.956    0.044        0.0     0.0            0.0
4    0.894    0.106        0.0     0.0            0.0
Confusion matrix at original 0.5 threshold:
[[ 89  14]
 [ 10 115]] 

Classification Report at original 0.5 threshold:
             precision    recall  f1-score   support

        0.0       0.90      0.86      0.88       103
        1.0       0.89      0.92      0.91       125

avg / total       0.89      0.89      0.89       228


Confusion matrix at 0.72 threshold:
[[ 92  11]
 [ 17 108]] 

Classification Report at 0.72 threshold:
             precision    recall  f1-score   support

        0.0       0.84      0.89      0.87       103
        1.0       0.91      0.86      0.89       125

avg / total       0.88      0.88      0.88       228


In [138]:
# calculate AUC using the predicted class labels (producing incorrect results)
print "Wrong way to calculate LR model AUC: ",metrics.roc_auc_score(y_test, predictions_lr)
print "Wrong way to calculate RF model AUC: ",metrics.roc_auc_score(y_test, predictions_rf)


Wrong way to calculate LR model AUC:  0.892038834951
Wrong way to calculate RF model AUC:  0.810058252427

Histograms of predicted probability grouped by the actual categories. The region in between is what gets misclassified, and you want it as flat as possible. The two classifiers have different distributions, so they get things wrong in different ways. The top pair is logistic regression and the bottom pair is the random forest.


In [139]:
# histogram of predicted probabilities grouped by actual response value for LR
y_test_lr_df.class_1.hist(by= y_test_lr_df.actual, sharex=True, sharey=True)
#same for RF
y_test_rf_df.class_1.hist(by= y_test_rf_df.actual, sharex=True, sharey=True)


Out[139]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x132d95f90>,
       <matplotlib.axes._subplots.AxesSubplot object at 0x132e11890>], dtype=object)

Label encoder

The ROC does not change under a monotonic transformation of the probabilities (doubling them, say): the relative ordering of the samples does not change, and the ROC only cares about the ranking.


In [140]:
#convert outcome into binary 0/1 attribute
le = LabelEncoder()
#create train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)
#create logistic regression object
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_test_pred = lr.predict(X_test)

print "Test set accuracy of default 0.5 threshold LR model: ",metrics.accuracy_score(y_test, y_test_pred)


Test set accuracy of default 0.5 threshold LR model:  0.890350877193

In [141]:
# calculate predicted probabilities for class 1
y_pred_prob1 = lr.predict_proba(X_test)[:, 1]
# show predicted probabilities in a histogram
sns.plt.hist(y_pred_prob1)


Out[141]:
(array([ 48.,  16.,  14.,   9.,  17.,   8.,  15.,  19.,  20.,  62.]),
 array([ 0.0004,  0.1002,  0.1999,  0.2996,  0.3994,  0.4991,  0.5989,
         0.6986,  0.7984,  0.8981,  0.9979]),
 <a list of 10 Patch objects>)

In [145]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob1)


Out[145]:
0.96831067961165052

In [146]:
# plot ROC curve
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob1)
sns.plt.plot(fpr, tpr)
sns.plt.xlim([0, 1])
sns.plt.ylim([0, 1.05])
sns.plt.xlabel('False Positive Rate (1 - Specificity)')
sns.plt.ylabel('True Positive Rate (Sensitivity)')


Out[146]:
<matplotlib.text.Text at 0x1331e9b90>

First demonstration: take the square root of the predicted probabilities (it makes them all bigger, but preserves their order).

Probabilities are only probabilities in the context of this data set. You are making a transformation that preserves a certain order: you are ordering the set, so the first sample is maybe the most Republican. After the transform the value is not really a probability anymore; the samples are never going to become Democrats, they are just ordered in a certain way.



In [147]:
# change the predicted probabilities
y_pred_prob2 = np.sqrt(y_pred_prob1)
# here are the old ones (y_pred_prob1)
print "Old predicted probs:\n",y_pred_prob1[:10].round(3)


Old predicted probs:
[ 0.658  0.289  0.738  0.297  0.299  0.949  0.193  0.002  0.881  0.971]

In [148]:
# here are the new ones (y_pred_prob2)
print "New predicted probs:\n",y_pred_prob2[:10].round(3)


New predicted probs:
[ 0.811  0.538  0.859  0.545  0.547  0.974  0.439  0.04   0.939  0.985]

In [149]:
# you can see the histogram changed
figure = sns.plt.figure(figsize=(12,8))
figure.add_subplot(121)
sns.plt.title("Original histogram of predicted probabilities")
sns.plt.hist(y_pred_prob1)
figure.add_subplot(122)
sns.plt.title("Histogram of square root predicted probabilities")
sns.plt.hist(y_pred_prob2)


Out[149]:
(array([ 12.,  19.,  17.,  11.,  11.,  15.,  19.,  15.,  28.,  81.]),
 array([ 0.0202,  0.1181,  0.2159,  0.3138,  0.4117,  0.5096,  0.6074,
         0.7053,  0.8032,  0.9011,  0.9989]),
 <a list of 10 Patch objects>)

In [150]:
# the AUC did not change
print "Old AUC: ",metrics.roc_auc_score(y_test, y_pred_prob1)
print "New AUC: ",metrics.roc_auc_score(y_test, y_pred_prob2)


Old AUC:  0.968310679612
New AUC:  0.968310679612

In [151]:
# the ROC curve did not change
fpr2, tpr2, thresholds2 = metrics.roc_curve(y_test, y_pred_prob2)
figure = sns.plt.figure(figsize=(12,8))
figure.add_subplot(121)
sns.plt.plot(fpr, tpr)
sns.plt.title("Original ROC Curve")
figure.add_subplot(122)
sns.plt.title("ROC Curve of sqrt probabilities")
sns.plt.plot(fpr2, tpr2)


Out[151]:
[<matplotlib.lines.Line2D at 0x1326a10d0>]

Pipeline


In [154]:
#create a logistic regression and check 10-fold RMSE
lr = LogisticRegression(C=1e9, penalty='l1')
cross_val_scores = np.abs(cross_val_score(lr,X,y,scoring = "mean_squared_error", cv=10))
rmse_cross_val_scores = map(np.sqrt, cross_val_scores)
print "Mean 10-fold rmse: ", np.mean(rmse_cross_val_scores)
print "Std 10-fold rmse: ", np.std(rmse_cross_val_scores)


Mean 10-fold rmse:  0.45237942083
Std 10-fold rmse:  0.0411547292556

Cross validation just allows you to split your data however you want.

This is for splitting the data frame; the categorical columns should come first and the numerical last.
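
For reference, a minimal sketch of building the folds by hand with the old KFold iterator (using the X and y defined earlier; the fold count and seed are arbitrary):

from sklearn.cross_validation import KFold

# 10 shuffled folds over the rows of X; each iteration yields integer indices
kf = KFold(len(X), n_folds=10, shuffle=True, random_state=1)
for train_idx, test_idx in kf:
    X_tr, X_te = X.iloc[train_idx], X.iloc[test_idx]
    y_tr, y_te = y.iloc[train_idx], y.iloc[test_idx]
    # fit and score a model on each fold here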


In [155]:
from sklearn.base import BaseEstimator, TransformerMixin

class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.

    The data is expected to be stored in a 2D data structure, where the first
    index is over features and the second is over samples.  i.e.

    >> len(data[key]) == n_samples

    Please note that this is the opposite convention to sklearn feature
    matrixes (where the first index corresponds to sample).

    ItemSelector only requires that the collection implement getitem
    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
    DataFrame, numpy record array, etc.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
    list of dicts).  If your data is structured this way, consider a
    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable, required
        The key corresponding to the desired value in a mappable.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

Categorical variables absolutely need to be encoded, and the encoding has to be applied only to the categorical columns; in this case I need to tweak it. LabelEncoder is a transformer: it takes strings and turns them into numbers.
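
A two-line sketch of what LabelEncoder does, using party strings like the ones in this data set:

from sklearn.preprocessing import LabelEncoder

# LabelEncoder maps string labels to the integers 0..n_classes-1
le = LabelEncoder()
print le.fit_transform(["D", "R", "D", "R"])  # [0 1 0 1]
print le.classes_                             # ['D' 'R']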


In [ ]:
# from sklearn.pipeline import FeatureUnion, Pipeline
# from sklearn.preprocessing import OneHotEncoder

# #encode the categorical column from strings to ints
# le = LabelEncoder()
# abalone_data["sex_encoded"] = abalone_data[[categorical_columns]].apply(le.fit_transform)

# #extract the y
# y = abalone_data.age

# #create the feature union for the features
# X_transformed_pipe = FeatureUnion(
#         transformer_list=[
#             # Pipeline for one hot encoding categorical column
#             ('sexes', Pipeline([
#                 ('selector', ItemSelector(key=["sex_encoded"])),
#                 ('encoder', OneHotEncoder())                    
#             ])),
#             # Pipeline for pulling out numeric features and scaling them
#             ('numeric', Pipeline([
#                 ('selector', ItemSelector(key=numeric_columns)),
#                 #('polyfeatures', PolynomialFeatures(degree=2,interaction_only=True)),
#                 ('scaler', StandardScaler()),
#             ]))])
# #create the full final pipeline
# full_pipeline = Pipeline([("all_features",X_transformed_pipe),("rf_regressor",RandomForestRegressor(n_estimators=100))])

A pipeline is a list of transformations. Every entry in the list is a tuple that defines two things: a string name for your transformation and then the actual transformation you want to do. The transformations can themselves be pipelines, and if you have 2 pipelines and want to combine their results together, you use a FeatureUnion, another class that takes the list of pipelines you want to put together.

In the example above, we take the original matrix and pull out the numerical and categorical columns, and they receive 2 actions: the numeric columns get scaled, the categorical ones get turned into numbers. The first stage of each sub-pipeline is to select its columns (in the example, "sexes" selects the encoded sex column); it's 2 pipelines. Encoding works like pd.get_dummies, with OneHotEncoder being scikit-learn's implementation of the same idea; then we move on to the numeric pipeline. The transformer_list takes the list of sub-pipelines, and the FeatureUnion itself gets put into another pipeline: the final pipeline's first stage is running through the feature union, and its second stage is running the random forest regressor (telling it how many trees). At the end you pass that whole crazy pipeline on to cross-validation and feed it your data.

Another, more involved case:

In the kidney data set, LabelEncoder converts things into integer classes starting at 0, and the 0 label ends up corresponding to NaN. Missing categorical values get imputed with the Imputer class: we tell it what marks a missing value, and the strategy is most_frequent. For the transformer you build a new pipeline where the selector first picks the column you care about, the next step applies the transformation that creates the column you want, and so on.
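
A minimal sketch of that Imputer usage (the kidney data itself isn't loaded here, so the input is a toy array with NaN as the missing-value marker):

from sklearn.preprocessing import Imputer

# fill missing entries with the most frequent value in each column
imp = Imputer(missing_values="NaN", strategy="most_frequent", axis=0)
print imp.fit_transform([[1, 2], [np.nan, 2], [1, np.nan]])
# [[ 1.  2.]
#  [ 1.  2.]
#  [ 1.  2.]]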

Every pipeline has an attribute called steps, and when you inspect it, it lists the stages. You can access the steps like an array: steps[0][1]. It can get very nested; everything is a sequence that contains another sequence.
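
A toy example of that indexing with a made-up two-step pipeline (the names here are illustrative):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# steps is a list of (name, step) tuples, so plain indexing unwraps it
pipe = Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression())])
print pipe.steps[0]     # ('scaler', StandardScaler(...))
print pipe.steps[1][1]  # the LogisticRegression object itself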

At the very end you would pickle the pipeline and the grid search, so you would have a basic script that can load the fitted model and score new data.
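
A sketch of that last step, assuming a fitted full_pipeline like the commented-out one above (the pipeline object and filename are hypothetical):

from sklearn.externals import joblib

# persist the fitted pipeline, then reload it in a separate scoring script
joblib.dump(full_pipeline, "party_words_pipeline.pkl")  # full_pipeline is hypothetical here
loaded = joblib.load("party_words_pipeline.pkl")
predictions = loaded.predict(X)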