I start by importing the basic libraries and loading the legislators data from a CSV file.
In [1]:
import pandas as pd
import numpy as np
In [2]:
legislatorsData = pd.read_csv("../data/legislators.csv")
legislatorsData.head()
legislatorsData.columns
legislators = pd.DataFrame(legislatorsData) # read_csv already returns a DataFrame; this just rebinds it under a shorter name
legislators.head()
Out[2]:
In [3]:
from urllib2 import Request, urlopen
import json
from pandas.io.json import json_normalize
In [4]:
def requestWords( id ):
    # query the Capitol Words phrases API for a legislator's most frequent n-grams
    id = str(id)
    url = "http://capitolwords.org/api/1/phrases.json?entity_type=legislator&entity_value="+id+"&apikey=0bf8e7eb6ce146f48217bfee767c998d"
    request = Request(url)
    response = urlopen(request)
    contents = response.read()
    # an empty JSON response ("[]") is only two characters long
    if len(contents) > 2:
        data = json.loads(contents)
        words = json_normalize(data)
        list_of_words = words.ngram.tolist()
        string_of_words = "|".join(list_of_words)
        return string_of_words
    else:
        return np.nan
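A failed request (a timeout, or an HTTP error for an unknown bioguide_id) would raise an exception and abort the whole apply call in the next cell. A minimal defensive variant, assuming the same urllib2 imports, could wrap the call like this (requestWordsSafe is a hypothetical name, not part of the original notebook):

from urllib2 import HTTPError, URLError

def requestWordsSafe(id):
    # same request as above, but a failed call returns NaN instead of raising
    try:
        return requestWords(id)
    except (HTTPError, URLError):
        return np.nan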
In [5]:
legislators['favorite_words'] = legislators.apply(lambda row: requestWords(row['bioguide_id']),axis=1)
In [6]:
print legislators.favorite_words.head(3)
print "Entries before dropping legislators with no word record:", len(legislators.favorite_words)
In [7]:
legislators_words = legislators[legislators.favorite_words.notnull()]
In [8]:
print "Number of legislators with word record:", len(legislators_words.favorite_words)
In [9]:
favorite_words = legislators_words.favorite_words.str.get_dummies(sep = "|")
print favorite_words.head(3)
favorite_words.columns[:100]
Out[9]:
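str.get_dummies splits each pipe-separated string into one indicator column per distinct n-gram. A tiny illustration of the same idea on made-up data:

pd.Series(["economy|jobs", "jobs|health care"]).str.get_dummies(sep="|")
#    economy  health care  jobs
# 0        1            0     1
# 1        0            1     1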
In [10]:
favorite_words.shape
Out[10]:
In [11]:
favorite_words.columns[260:300]
Out[11]:
In [12]:
favorite_words.columns[760:800]
Out[12]:
In [13]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)
vectorizer
Out[13]:
In [14]:
corpus = favorite_words.columns.tolist()
corpus[:3]
Out[14]:
In [15]:
X = vectorizer.fit_transform(corpus)
In [16]:
analyze = vectorizer.build_analyzer()
print analyze("economy a this")
vectorizer.get_feature_names()[910:920]
Out[16]:
In [17]:
vectorizer.vocabulary_.get('document') # index of 'document' in the fitted vocabulary, or None if absent; words not seen in the training corpus are ignored by later calls to transform
Out[17]:
In [18]:
vectorizer.transform(['Something completely unrelated']).toarray()
Out[18]:
In [19]:
from sklearn.feature_extraction.text import TfidfTransformer
In [20]:
transformer = TfidfTransformer()
transformer
Out[20]:
In [21]:
tfidf = transformer.fit_transform(favorite_words)
tfidf_array = tfidf.toarray()
tfidf_array.shape
tfidf_array[20].max()
transformer.idf_
Out[21]:
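TfidfTransformer rescales the raw indicator matrix so that n-grams shared by many legislators get a low weight and rarer ones a higher weight; idf_ holds the per-term inverse document frequency. A toy sketch of that behavior on a made-up 3x2 count matrix:

toy_counts = [[1, 1], [1, 0], [1, 0]] # the first term appears for everyone, the second only once
toy_tfidf = TfidfTransformer().fit_transform(toy_counts).toarray()
print toy_tfidf # in the first row the ubiquitous term gets a lower weight than the rare one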
In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1)
vectorizer.fit_transform(corpus)
vec_idf = vectorizer.idf_
print len(vec_idf)
In [23]:
words_weight = pd.DataFrame(tfidf_array, index=legislators_words.index , columns=corpus)
print legislators_words.index
print words_weight.index
In [24]:
capitol_words = legislators_words.merge(words_weight, right_index=True, left_index=True)
In [25]:
capitol_words.head()
Out[25]:
In [26]:
del capitol_words["a"]
column_names_capitol = capitol_words.columns.tolist()
word_column_names = column_names_capitol[806:]
number_column_names = column_names_capitol[30:805]
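Slicing column_names_capitol by fixed positions assumes the merged frame always keeps exactly this column order. A more robust (hypothetical) alternative would be to remember the word columns from words_weight before the merge, assuming none of the n-gram names collide with the original legislator columns (colliding names get an _x/_y suffix from the merge):

word_column_names_alt = words_weight.columns.drop("a").tolist() # hypothetical; not part of the original run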
In [27]:
capitol_words[word_column_names].head()
Out[27]:
In [28]:
capitol_words[word_column_names].sum().max()
Out[28]:
In [29]:
total_word_counts = capitol_words[word_column_names].sum()
max_sum_col = total_word_counts[total_word_counts==total_word_counts.max()]
max_sum_col.index[0]
Out[29]:
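The same column label can be read off in one step with idxmax, which returns the index label of the maximum value:

total_word_counts.idxmax()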
In [30]:
print "Words before cleanse for maximizing variance:", len(word_column_names)
In [31]:
def percentage_global_mention(word_column_names):
    for word in word_column_names:
        not_mentioned_mask = capitol_words[word_column_names][word]==0.0
        # mask() turns the zero entries into NaN, so count() gives the number of legislators who actually used the word
        not_mentioned_count = capitol_words[word_column_names][word].mask(not_mentioned_mask).count()
        index_count = capitol_words.count()[0]
        percentage = float(not_mentioned_count)/index_count
        # drop words used by more than 95% of legislators, since they carry little discriminating variance
        if percentage > 0.95:
            print percentage, word
            del capitol_words[word]
        else:
            print "Nothing to worry about"
In [32]:
word_frequencies = (capitol_words[word_column_names]>0).astype(int).sum(axis=0).astype(float)/capitol_words.shape[0]
most_frequent_words = word_frequencies[word_frequencies>.95].index
most_frequent_words
Out[32]:
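Having identified the n-grams used by more than 95% of legislators, one possible follow-up (not shown in the original run) is to drop those columns in a single vectorized call, mirroring what percentage_global_mention does word by word:

capitol_words_trimmed = capitol_words.drop(most_frequent_words, axis=1) # hypothetical name, not part of the original run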
In [33]:
word_frequencies = (capitol_words[word_column_names]>0).astype(int).sum(axis=0)
word_frequencies.max()
Out[33]:
In [34]:
capitol_words.party_x.unique()
party_mask = capitol_words.party_x!="I"
two_party_words = capitol_words[party_mask]
print "Entries before getting rid of independents:", capitol_words.shape[0]
print "Entries after getting rid of independents:", two_party_words.shape[0]
print "Number of independents:", (capitol_words.shape[0])-(two_party_words.shape[0])
In [35]:
two_party_words.to_csv(path_or_buf="../data/two.csv")