Cleaning the Data and creating a sparse matrix

I start by importing the basic libraries and the data from a file.


In [1]:
import pandas as pd
import numpy as np

This is what the data looks like:


In [2]:
legislators = pd.read_csv("../data/legislators.csv")
legislators.head()


Out[2]:
title firstname middlename lastname name_suffix nickname party state district in_office ... govtrack_id crp_id twitter_id congresspedia_url youtube_url facebook_id official_rss senate_class birthdate oc_email
0 Rep Neil NaN Abercrombie NaN NaN D HI 1 0 ... 400001 N00007665 neilabercrombie http://www.opencongress.org/wiki/Neil_Abercrombie http://youtube.com/hawaiirep1 NaN NaN NaN 1938-06-26 NaN
1 Rep Gary L. Ackerman NaN NaN D NY 5 0 ... 400003 N00001143 repgaryackerman http://www.opencongress.org/wiki/Gary_Ackerman http://youtube.com/RepAckerman RepAcherman NaN NaN 1942-11-19 NaN
2 Rep Robert B. Aderholt NaN NaN R AL 4 1 ... 400004 N00003028 Robert_Aderholt http://www.opencongress.org/wiki/Robert_Aderholt http://youtube.com/RobertAderholt 19787529402 NaN NaN 1965-07-22 Rep.Aderholt@opencongress.org
3 Sen Daniel Kahikina Akaka NaN NaN D HI Junior Seat 0 ... 300001 N00007653 NaN http://www.opencongress.org/wiki/Daniel_Akaka http://youtube.com/senatorakaka danielakaka NaN I 1924-09-11 NaN
4 Sen Wayne A. Allard NaN NaN R CO Senior Seat 0 ... 300003 N00009082 NaN http://www.opencongress.org/wiki/Wayne_Allard NaN NaN NaN II 1943-12-02 NaN

5 rows × 29 columns


In [3]:
from urllib2 import Request, urlopen
import json
from pandas.io.json import json_normalize

In [4]:
def requestWords(bioguide_id):
    # Query the Capitol Words API for the phrases most used by a legislator,
    # identified by bioguide id.
    bioguide_id = str(bioguide_id)
    url = "http://capitolwords.org/api/1/phrases.json?entity_type=legislator&entity_value="+bioguide_id+"&apikey=0bf8e7eb6ce146f48217bfee767c998d"
    request = Request(url)
    response = urlopen(request)
    contents = response.read()
    # An empty JSON response ("[]") means the API has no word record for this legislator.
    if len(contents) > 2:
        data = json.loads(contents)
        words = json_normalize(data)
        list_of_words = words.ngram.tolist()
        string_of_words = "|".join(list_of_words)
        return string_of_words
    else:
        return np.nan

In [5]:
legislators['favorite_words'] = legislators.apply(lambda row: requestWords(row['bioguide_id']),axis=1)

In [6]:
print legislators.favorite_words.head(3)
print "All entries before getting rid of entris with no words:", len(legislators.favorite_words)


0    hawaiian|hawaii|hawaiians|hawaii's|kalaupapa|e...
1    queens|rabbi|jewish|bayside|flushing|nassau|br...
2    aderholt|requesting|irons|huntsville|alabama|r...
Name: favorite_words, dtype: object
All entries before getting rid of entries with no words: 897

In [7]:
legislators_words = legislators[legislators.favorite_words.notnull()]

In [8]:
print "Number of legislators with word record:", len(legislators_words.favorite_words)


Number of legislators with a word record: 763

In [9]:
favorite_words = legislators_words.favorite_words.str.get_dummies(sep = "|")
print favorite_words.head(3)
favorite_words.columns[:100]


   $0  $1  $1.50  $10  $100  $1000  $100000  $1000000  $10638425746293  $107  \
0   0   0      0    0     0      0        0         0                0     0   
1   0   0      0    0     0      0        0         0                0     0   
2   0   0      0    0     0      0        0         0                0     0   

   ...   ziegler  zimbabwe  zimmer  zinc  zion  zoberman  zone  zones  zoo  \
0  ...         0         0       0     0     0         0     0      0    0   
1  ...         0         0       0     0     0         0     0      0    0   
2  ...         0         0       0     0     0         0     0      0    0   

   zuni  
0     0  
1     0  
2     0  

[3 rows x 14420 columns]
Out[9]:
Index([u'$0', u'$1', u'$1.50', u'$10', u'$100', u'$1000', u'$100000',
       u'$1000000', u'$10638425746293', u'$107', u'$12', u'$120', u'$12000',
       u'$13', u'$1300', u'$139', u'$14', u'$1400', u'$15', u'$1500',
       u'$150000', u'$1500000', u'$159', u'$1600', u'$17', u'$170', u'$18',
       u'$186', u'$19', u'$191', u'$2', u'$2.33', u'$200', u'$2000',
       u'$200000', u'$2000000', u'$21', u'$23', u'$23000', u'$236', u'$240',
       u'$25', u'$250', u'$250000', u'$2500000', u'$270', u'$27000', u'$290',
       u'$29000', u'$3', u'$300', u'$3000', u'$30000', u'$300000', u'$30500',
       u'$310', u'$319', u'$3300', u'$35', u'$350', u'$35000', u'$350000',
       u'$38', u'$4', u'$4.50', u'$400', u'$400000', u'$45', u'$46', u'$464',
       u'$5', u'$50', u'$500', u'$5000', u'$50000', u'$500000', u'$5100000',
       u'$58', u'$58000', u'$6', u'$60', u'$600', u'$600000', u'$6000000',
       u'$61', u'$683', u'$700', u'$713', u'$730', u'$750', u'$750000',
       u'$760', u'$787', u'$8', u'$80', u'$800', u'$81', u'$87', u'$90',
       u'$900'],
      dtype='object')

In [10]:
favorite_words.shape


Out[10]:
(763, 14420)

In [11]:
favorite_words.columns[260:300]


Out[11]:
Index([u'1728', u'174', u'174000', u'1741', u'175', u'1750', u'177', u'1771',
       u'1775', u'178', u'1782', u'18', u'180', u'1803', u'1805', u'1808',
       u'1812', u'182', u'1821', u'184', u'1849', u'185', u'1860', u'1861',
       u'1862', u'1863', u'1872', u'1873', u'188', u'1886', u'189', u'1894',
       u'1898', u'19', u'1900', u'1909', u'1910', u'19107', u'1911', u'1915'],
      dtype='object')

In [12]:
favorite_words.columns[760:800]


Out[12]:
Index([u'944', u'95', u'952', u'953', u'96', u'964', u'97', u'98', u'9800',
       u'9896', u'990', u'991', u'9946', u'999', u'9:30', u'?', u'a', u'a&m',
       u'a-plus', u'a.', u'a.d.', u'a.m.', u'a.m.e.', u'aaa', u'aacute',
       u'aahsa', u'aamodt', u'aana', u'aapg', u'aapi', u'aapis', u'aaron',
       u'aarp', u'abandon', u'abandoned', u'abaya', u'abbas', u'abbas's',
       u'abbeville', u'abby'],
      dtype='object')

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)
vectorizer


Out[13]:
CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

Corpus

I used the whole vocabulary of most-repeated words as the corpus, so the tf-idf weights are more global.


In [14]:
corpus = favorite_words.columns.tolist()
corpus[:3]


Out[14]:
[u'$0', u'$1', u'$1.50']

In [15]:
X = vectorizer.fit_transform(corpus)

In [16]:
analyze = vectorizer.build_analyzer()
print analyze("economy a this")
vectorizer.get_feature_names()[910:920]


[u'economy', u'this']
Out[16]:
[u'africans',
 u'after',
 u'aftermath',
 u'afternoon',
 u'afterschool',
 u'afterward',
 u'ag',
 u'again',
 u'against',
 u'agana']

In [17]:
vectorizer.vocabulary_.get('document')  # words not seen in the training corpus are completely ignored in future calls to the transform method


Out[17]:
4357

In [18]:
vectorizer.transform(['Something completely unrelated']).toarray()


Out[18]:
array([[0, 0, 0, ..., 0, 0, 0]])

Bigrams

I am not sure I should add bigrams, since the corpus is made up of single words plus a lot of the numbers politicians love to drop; a sketch is below anyway.
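
A minimal sketch, assuming I wanted to try them: passing ngram_range=(1, 2) to CountVectorizer makes it count adjacent word pairs as well as single words, though on this corpus of mostly single tokens it would add little.


In [ ]:
# Sketch only (not run): count unigrams and bigrams over the same corpus.
# Because most entries of corpus are single tokens, few bigrams would appear.
from sklearn.feature_extraction.text import CountVectorizer

bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)
X_bigrams = bigram_vectorizer.fit_transform(corpus)
print X_bigrams.shape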

Transformer


In [19]:
from sklearn.feature_extraction.text import TfidfTransformer

In [20]:
transformer = TfidfTransformer()
transformer


Out[20]:
TfidfTransformer(norm=u'l2', smooth_idf=True, sublinear_tf=False,
         use_idf=True)

In [21]:
tfidf = transformer.fit_transform(favorite_words)
tfidf_array = tfidf.toarray()
tfidf_array.shape
tfidf_array[20].max()
transformer.idf_


Out[21]:
array([ 6.25227343,  4.93051759,  6.94542061, ...,  6.94542061,
        6.5399555 ,  6.94542061])

Vectorizer


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1)
vectorizer.fit_transform(corpus)
vec_idf = vectorizer.idf_
print len(vec_idf)


13670

In [23]:
words_weight = pd.DataFrame(tfidf_array, index=legislators_words.index , columns=corpus)
print legislators_words.index
print words_weight.index


Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            886, 887, 888, 889, 890, 891, 892, 893, 894, 896],
           dtype='int64', length=763)
Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            886, 887, 888, 889, 890, 891, 892, 893, 894, 896],
           dtype='int64', length=763)

In [24]:
capitol_words = legislators_words.merge(words_weight, right_index=True, left_index=True)

In [25]:
capitol_words.head()


Out[25]:
title_x firstname middlename lastname name_suffix nickname party_x state_x district_x in_office ... ziegler zimbabwe zimmer zinc zion zoberman zone zones zoo zuni
0 Rep Neil NaN Abercrombie NaN NaN D HI 1 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 Rep Gary L. Ackerman NaN NaN D NY 5 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Rep Robert B. Aderholt NaN NaN R AL 4 1 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 Sen Daniel Kahikina Akaka NaN NaN D HI Junior Seat 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 Sen Wayne A. Allard NaN NaN R CO Senior Seat 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 14450 columns


In [26]:
del capitol_words["a"]
column_names_capitol = capitol_words.columns.tolist()
word_column_names = column_names_capitol[806:]
number_column_names = column_names_capitol[30:805]

In [27]:
capitol_words[word_column_names].head()


Out[27]:
a&m a-plus a. a.d. a.m. a.m.e. aaa aacute aahsa aamodt ... ziegler zimbabwe zimmer zinc zion zoberman zone zones zoo zuni
0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.096384 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 13643 columns


In [28]:
capitol_words[word_column_names].sum().max()


Out[28]:
12.815564347877871

Now I will make another DataFrame, removing the words with the largest total counts; a sketch of the removal follows the next cell.


In [29]:
total_word_counts = capitol_words[word_column_names].sum()
max_sum_col = total_word_counts[total_word_counts==total_word_counts.max()]
max_sum_col.index[0]


Out[29]:
u'iraq'
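
The cell above only identifies the single heaviest word (it turns out to be 'iraq'). A minimal sketch of the actual removal, assuming an arbitrary cutoff of the 10 columns with the largest summed tf-idf weight (my own choice):


In [ ]:
# Sketch only (not run): build a trimmed DataFrame without the n word columns
# that carry the largest summed tf-idf weight. The cutoff of 10 is arbitrary.
n_heaviest = 10
heaviest_words = total_word_counts.sort_values(ascending=False).index[:n_heaviest]
capitol_words_trimmed = capitol_words.drop(list(heaviest_words), axis=1)
print capitol_words_trimmed.shape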

In [30]:
print "Words before cleanse for maximizing variance:", len(word_column_names)


Words before cleaning to maximize variance: 13643

Getting rid of the words that more than 95% of the legislators said

They don't add much when analyzing what makes legislators different from one another.


In [31]:
def percentage_global_mention(word_column_names):
    # For each word, compute the fraction of legislators who used it at all,
    # and drop the column if more than 95% of them did.
    for word in word_column_names:
        mentioned_count = (capitol_words[word] > 0.0).sum()
        total_legislators = capitol_words.shape[0]
        percentage = float(mentioned_count) / total_legislators
        if percentage > 0.95:
            print percentage, word
            del capitol_words[word]

Or a better way to do it (credits to Sergey):


In [32]:
word_frequencies = (capitol_words[word_column_names]>0).astype(int).sum(axis=0).astype(float)/capitol_words.shape[0]
most_frequent_words = word_frequencies[word_frequencies>.95].index
most_frequent_words


Out[32]:
Index([], dtype='object')

In [33]:
word_frequencies = (capitol_words[word_column_names]>0).astype(int).sum(axis=0)
word_frequencies.max()


Out[33]:
323

In [34]:
capitol_words.party_x.unique()
party_mask = capitol_words.party_x!="I"
two_party_words = capitol_words[party_mask]
print "Entries before getting rid of independents:", capitol_words.shape[0]
print "Entries after getting rid of independents:", two_party_words.shape[0]
print "Number of independents:", (capitol_words.shape[0])-(two_party_words.shape[0])


Entries before getting rid of independents: 763
Entries after getting rid of independents: 760
Number of independents: 3

In [35]:
two_party_words.to_csv(path_or_buf="../data/two.csv")