Cleaning the Data and creating a sparse matrix

I start by importing the basic libraries and the data from a file.


In [1]:
import pandas as pd
import numpy as np

This is what the data looks like:


In [2]:
legislators = pd.read_csv("../data/legislators.csv")
legislators.head()


Out[2]:
title firstname middlename lastname name_suffix nickname party state district in_office ... govtrack_id crp_id twitter_id congresspedia_url youtube_url facebook_id official_rss senate_class birthdate oc_email
0 Rep Neil NaN Abercrombie NaN NaN D HI 1 0 ... 400001 N00007665 neilabercrombie http://www.opencongress.org/wiki/Neil_Abercrombie http://youtube.com/hawaiirep1 NaN NaN NaN 1938-06-26 NaN
1 Rep Gary L. Ackerman NaN NaN D NY 5 0 ... 400003 N00001143 repgaryackerman http://www.opencongress.org/wiki/Gary_Ackerman http://youtube.com/RepAckerman RepAcherman NaN NaN 1942-11-19 NaN
2 Rep Robert B. Aderholt NaN NaN R AL 4 1 ... 400004 N00003028 Robert_Aderholt http://www.opencongress.org/wiki/Robert_Aderholt http://youtube.com/RobertAderholt 19787529402 NaN NaN 1965-07-22 Rep.Aderholt@opencongress.org
3 Sen Daniel Kahikina Akaka NaN NaN D HI Junior Seat 0 ... 300001 N00007653 NaN http://www.opencongress.org/wiki/Daniel_Akaka http://youtube.com/senatorakaka danielakaka NaN I 1924-09-11 NaN
4 Sen Wayne A. Allard NaN NaN R CO Senior Seat 0 ... 300003 N00009082 NaN http://www.opencongress.org/wiki/Wayne_Allard NaN NaN NaN II 1943-12-02 NaN

5 rows × 29 columns


In [3]:
from urllib2 import Request, urlopen
import json
from pandas.io.json import json_normalize

In [4]:
def requestWords(bioguide_id):
    # Query the Capitol Words API for the phrases most used by a legislator,
    # identified by bioguide id.
    bioguide_id = str(bioguide_id)
    url = "http://capitolwords.org/api/1/phrases.json?entity_type=legislator&entity_value="+bioguide_id+"&apikey=0bf8e7eb6ce146f48217bfee767c998d"
    request = Request(url)
    response = urlopen(request)
    contents = response.read()
    # An empty JSON response ("[]") means the API has no word record for this legislator.
    if len(contents) > 2:
        data = json.loads(contents)
        words = json_normalize(data)
        list_of_words = words.ngram.tolist()
        string_of_words = "|".join(list_of_words)
        return string_of_words
    else:
        return np.nan

In [5]:
legislators['favorite_words'] = legislators.apply(lambda row: requestWords(row['bioguide_id']),axis=1)

In [6]:
print legislators.favorite_words.head(3)
print "All entries before getting rid of entris with no words:", len(legislators.favorite_words)


0    hawaiian|hawaii|hawaiians|hawaii's|kalaupapa|e...
1    queens|rabbi|jewish|bayside|flushing|nassau|br...
2    aderholt|requesting|irons|huntsville|alabama|r...
Name: favorite_words, dtype: object
All entries before getting rid of entries with no words: 897

In [7]:
legislators_words = legislators[legislators.favorite_words.notnull()]

In [8]:
print "Number of legislators with word record:", len(legislators_words.favorite_words)


Number of legislators with a word record: 763

In [9]:
favorite_words = legislators_words.favorite_words.str.get_dummies(sep = "|")
print favorite_words.head(3)
favorite_words.columns[:100]


   $0  $1  $1.50  $10  $100  $1000  $100000  $1000000  $10638425746293  $107  \
0   0   0      0    0     0      0        0         0                0     0   
1   0   0      0    0     0      0        0         0                0     0   
2   0   0      0    0     0      0        0         0                0     0   

   ...   ziegler  zimbabwe  zimmer  zinc  zion  zoberman  zone  zones  zoo  \
0  ...         0         0       0     0     0         0     0      0    0   
1  ...         0         0       0     0     0         0     0      0    0   
2  ...         0         0       0     0     0         0     0      0    0   

   zuni  
0     0  
1     0  
2     0  

[3 rows x 14420 columns]
Out[9]:
Index([u'$0', u'$1', u'$1.50', u'$10', u'$100', u'$1000', u'$100000',
       u'$1000000', u'$10638425746293', u'$107', u'$12', u'$120', u'$12000',
       u'$13', u'$1300', u'$139', u'$14', u'$1400', u'$15', u'$1500',
       u'$150000', u'$1500000', u'$159', u'$1600', u'$17', u'$170', u'$18',
       u'$186', u'$19', u'$191', u'$2', u'$2.33', u'$200', u'$2000',
       u'$200000', u'$2000000', u'$21', u'$23', u'$23000', u'$236', u'$240',
       u'$25', u'$250', u'$250000', u'$2500000', u'$270', u'$27000', u'$290',
       u'$29000', u'$3', u'$300', u'$3000', u'$30000', u'$300000', u'$30500',
       u'$310', u'$319', u'$3300', u'$35', u'$350', u'$35000', u'$350000',
       u'$38', u'$4', u'$4.50', u'$400', u'$400000', u'$45', u'$46', u'$464',
       u'$5', u'$50', u'$500', u'$5000', u'$50000', u'$500000', u'$5100000',
       u'$58', u'$58000', u'$6', u'$60', u'$600', u'$600000', u'$6000000',
       u'$61', u'$683', u'$700', u'$713', u'$730', u'$750', u'$750000',
       u'$760', u'$787', u'$8', u'$80', u'$800', u'$81', u'$87', u'$90',
       u'$900'],
      dtype='object')

In [10]:
favorite_words.shape


Out[10]:
(763, 14420)

In [11]:
favorite_words.columns[260:300]


Out[11]:
Index([u'1728', u'174', u'174000', u'1741', u'175', u'1750', u'177', u'1771',
       u'1775', u'178', u'1782', u'18', u'180', u'1803', u'1805', u'1808',
       u'1812', u'182', u'1821', u'184', u'1849', u'185', u'1860', u'1861',
       u'1862', u'1863', u'1872', u'1873', u'188', u'1886', u'189', u'1894',
       u'1898', u'19', u'1900', u'1909', u'1910', u'19107', u'1911', u'1915'],
      dtype='object')

In [12]:
favorite_words.columns[760:800]


Out[12]:
Index([u'944', u'95', u'952', u'953', u'96', u'964', u'97', u'98', u'9800',
       u'9896', u'990', u'991', u'9946', u'999', u'9:30', u'?', u'a', u'a&m',
       u'a-plus', u'a.', u'a.d.', u'a.m.', u'a.m.e.', u'aaa', u'aacute',
       u'aahsa', u'aamodt', u'aana', u'aapg', u'aapi', u'aapis', u'aaron',
       u'aarp', u'abandon', u'abandoned', u'abaya', u'abbas', u'abbas's',
       u'abbeville', u'abby'],
      dtype='object')

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)
vectorizer


Out[13]:
CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

Corpus

I used the whole vocabulary of most-repeated words as the corpus, so the tf-idf weights are more global.


In [14]:
corpus = favorite_words.columns.tolist()
corpus[:3]


Out[14]:
[u'$0', u'$1', u'$1.50']

In [15]:
X = vectorizer.fit_transform(corpus)

In [16]:
analyze = vectorizer.build_analyzer()
print analyze("economy a this")
vectorizer.get_feature_names()[910:920]


[u'economy', u'this']
Out[16]:
[u'africans',
 u'after',
 u'aftermath',
 u'afternoon',
 u'afterschool',
 u'afterward',
 u'ag',
 u'again',
 u'against',
 u'agana']

In [17]:
vectorizer.vocabulary_.get('document')  # words not seen in the training corpus are completely ignored in future calls to the transform method


Out[17]:
4357

In [18]:
vectorizer.transform(['Something completely unrelated']).toarray()


Out[18]:
array([[0, 0, 0, ..., 0, 0, 0]])

Bigrams

I am not sure I should add bigrams, since the corpus is made up of single words plus a lot of the numbers politicians love to drop; a sketch is below anyway.
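
A minimal sketch, assuming I wanted to try them: passing ngram_range=(1, 2) to CountVectorizer makes it count adjacent word pairs as well as single words, though on this corpus of mostly single tokens it would add little.


In [ ]:
# Sketch only (not run): count unigrams and bigrams over the same corpus.
# Because most entries of corpus are single tokens, few bigrams would appear.
from sklearn.feature_extraction.text import CountVectorizer

bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)
X_bigrams = bigram_vectorizer.fit_transform(corpus)
print X_bigrams.shape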

Transformer


In [19]:
from sklearn.feature_extraction.text import TfidfTransformer

In [20]:
transformer = TfidfTransformer()
transformer


Out[20]:
TfidfTransformer(norm=u'l2', smooth_idf=True, sublinear_tf=False,
         use_idf=True)

In [21]:
tfidf = transformer.fit_transform(favorite_words)
tfidf_array = tfidf.toarray()
tfidf_array.shape
tfidf_array[20].max()
transformer.idf_


Out[21]:
array([ 6.25227343,  4.93051759,  6.94542061, ...,  6.94542061,
        6.5399555 ,  6.94542061])

Vectorizer


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1)
vectorizer.fit_transform(corpus)
vec_idf = vectorizer.idf_
print len(vec_idf)


13670

In [23]:
words_weight = pd.DataFrame(tfidf_array, index=legislators_words.index , columns=corpus)
print legislators_words.index
print words_weight.index


Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            886, 887, 888, 889, 890, 891, 892, 893, 894, 896],
           dtype='int64', length=763)
Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            886, 887, 888, 889, 890, 891, 892, 893, 894, 896],
           dtype='int64', length=763)

In [24]:
capitol_words = legislators_words.merge(words_weight, right_index=True, left_index=True)

In [25]:
capitol_words.head()


Out[25]:
title_x firstname middlename lastname name_suffix nickname party_x state_x district_x in_office ... ziegler zimbabwe zimmer zinc zion zoberman zone zones zoo zuni
0 Rep Neil NaN Abercrombie NaN NaN D HI 1 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 Rep Gary L. Ackerman NaN NaN D NY 5 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Rep Robert B. Aderholt NaN NaN R AL 4 1 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 Sen Daniel Kahikina Akaka NaN NaN D HI Junior Seat 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 Sen Wayne A. Allard NaN NaN R CO Senior Seat 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 14450 columns


In [26]:
del capitol_words["a"]
column_names_capitol = capitol_words.columns.tolist()
word_column_names = column_names_capitol[806:]
number_column_names = column_names_capitol[30:805]

In [27]:
capitol_words[word_column_names].head()


Out[27]:
a&m a-plus a. a.d. a.m. a.m.e. aaa aacute aahsa aamodt ... ziegler zimbabwe zimmer zinc zion zoberman zone zones zoo zuni
0 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0 0.000000 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.0 0.0 0.0 0.0 0.096384 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 13643 columns


In [28]:
capitol_words[word_column_names].sum().max()


Out[28]:
12.815564347877871

Now I will make another DataFrame, removing the words with the largest total counts; a sketch of the removal follows the next cell.


In [29]:
total_word_counts = capitol_words[word_column_names].sum()
max_sum_col = total_word_counts[total_word_counts==total_word_counts.max()]
max_sum_col.index[0]


Out[29]:
u'iraq'
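
The cell above only identifies the single heaviest word (it turns out to be 'iraq'). A minimal sketch of the actual removal, assuming an arbitrary cutoff of the 10 columns with the largest summed tf-idf weight (my own choice):


In [ ]:
# Sketch only (not run): build a trimmed DataFrame without the n word columns
# that carry the largest summed tf-idf weight. The cutoff of 10 is arbitrary.
n_heaviest = 10
heaviest_words = total_word_counts.sort_values(ascending=False).index[:n_heaviest]
capitol_words_trimmed = capitol_words.drop(list(heaviest_words), axis=1)
print capitol_words_trimmed.shape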

In [30]:
print "Words before cleanse for maximizing variance:", len(word_column_names)


Words before cleaning to maximize variance: 13643

Getting rid of the words that more than 95% of the legislators said

They don't add much when analyzing what makes legislators different from one another.


In [31]:
def percentage_global_mention(word_column_names):
    # For each word, compute the fraction of legislators who used it at all,
    # and drop the column if more than 95% of them did.
    for word in word_column_names:
        mentioned_count = (capitol_words[word] > 0.0).sum()
        total_legislators = capitol_words.shape[0]
        percentage = float(mentioned_count) / total_legislators
        if percentage > 0.95:
            print percentage, word
            del capitol_words[word]

Or a better way to do it (credits to Sergey):


In [32]:
word_frequencies = (capitol_words[word_column_names]>0).astype(int).sum(axis=0).astype(float)/capitol_words.shape[0]
most_frequent_words = word_frequencies[word_frequencies>.95].index
most_frequent_words


Out[32]:
Index([], dtype='object')

In [33]:
word_frequencies = (capitol_words[word_column_names]>0).astype(int).sum(axis=0)
word_frequencies.max()


Out[33]:
323

In [34]:
capitol_words.party_x.unique()
party_mask = capitol_words.party_x!="I"
two_party_words = capitol_words[party_mask]
print "Entries before getting rid of independents:", capitol_words.shape[0]
print "Entries after getting rid of independents:", two_party_words.shape[0]
print "Number of independents:", (capitol_words.shape[0])-(two_party_words.shape[0])


Entries before getting rid of independents: 763
Entries after getting rid of independents: 760
Number of independents: 3

In [35]:
two_party_words.to_csv(path_or_buf="../data/two.csv")