In [2]:
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
In [ ]:
# Try to stem, lemmatize, remove stop words, and use tf/idf on subject areas and titles
In [3]:
# Fixed list of the ten predefined subject areas, one label per line.
important_subjects = [
    'Antitrust',
    'Banking and Finance',
    'Bankruptcy',
    'Corporate Mergers and Acquisitions',
    'Employee Benefits',
    'Health',
    'Intellectual Property',
    'Labor and Employment',
    'Securities',
    'Tax',
]
In [8]:
# Wrap every predefined subject string in a TextBlob object.
subjects = list(map(TextBlob, important_subjects))
In [10]:
# Initialize the English Snowball stemmer (reused by stem_words below).
stemmer = SnowballStemmer('english')
# Stem each word of each subject. print(...) with a single parenthesised
# argument prints the same list under the Python 2 print statement and the
# Python 3 print function, so this cell runs on either interpreter.
print([stemmer.stem(word) for s in subjects for word in s.words])
[u'antitrust', u'bank', u'and', u'financ', u'bankruptci', u'corpor', u'merger', u'and', u'acquisit', u'employe', u'benefit', u'health', u'intellectu', u'properti', u'labor', u'employ', u'secur', u'tax']
In [100]:
# define a function that accepts text and returns a list of lemmas
def split_into_lemmas(text):
    """Return the lemma of every word in ``text`` as a list.

    The text is lower-cased, tokenised with ``TextBlob(...).words`` and each
    token is passed through ``lemmatize()`` with its default arguments.
    """
    lemmas = []
    for token in TextBlob(text.lower()).words:
        lemmas.append(token.lemmatize())
    return lemmas
def split_into_lemmas2(text):
    """Return ``text`` lower-cased and lemmatised as one space-joined string.

    Same tokenising and lemmatising steps as ``split_into_lemmas``, but the
    lemmas are joined with single spaces so the result is a string (the form
    a vectorizer ``preprocessor`` hands on to the tokenizer).
    """
    tokens = TextBlob(text.lower()).words
    return " ".join(token.lemmatize() for token in tokens)
def stem_words(text):
    """Return ``text`` with every word stemmed, joined by single spaces.

    ``stemmer.stem`` treats its argument as ONE word, so handing it a whole
    phrase only rewrites the end of the string and leaves the earlier words
    unstemmed. The text is therefore split into words with
    ``TextBlob(...).words`` first and each word is stemmed on its own, the
    same way the standalone stemming cell does it.

    A single-word input gives the same result as calling ``stemmer.stem``
    on it directly; an empty string yields an empty string.
    """
    # Stem word by word; the module-level Snowball ``stemmer`` is reused.
    return " ".join(stemmer.stem(word) for word in TextBlob(text).words)
# Use split_into_lemmas2 (lower-case + lemmatise, returned as one string) as the
# vectorizer's preprocessor (WARNING: SLOW!).
# TfidfVectorizer over 1- to 3-grams with the built-in English stop-word list.
# Alternative kept for reference: stem instead of lemmatise.
#vect = TfidfVectorizer(preprocessor=stem_words, ngram_range=(1,3), stop_words='english')
vect = TfidfVectorizer(preprocessor=split_into_lemmas2, ngram_range=(1,3), stop_words='english')
In [101]:
# TF-IDF weights of the predefined subjects: one row per subject, one column
# per n-gram the vectorizer extracted.
pd.DataFrame(
    data=vect.fit_transform(important_subjects).toarray(),
    columns=vect.get_feature_names(),
)
Out[101]:
acquisition
antitrust
banking
banking finance
bankruptcy
benefit
corporate
corporate merger
corporate merger acquisition
employee
...
health
intellectual
intellectual property
labor
labor employment
merger
merger acquisition
property
security
tax
0
0.000000
1.0
0.00000
0.00000
0.0
0.00000
0.000000
0.000000
0.000000
0.00000
...
0.0
0.00000
0.00000
0.00000
0.00000
0.000000
0.000000
0.00000
0.0
0.0
1
0.000000
0.0
0.57735
0.57735
0.0
0.00000
0.000000
0.000000
0.000000
0.00000
...
0.0
0.00000
0.00000
0.00000
0.00000
0.000000
0.000000
0.00000
0.0
0.0
2
0.000000
0.0
0.00000
0.00000
1.0
0.00000
0.000000
0.000000
0.000000
0.00000
...
0.0
0.00000
0.00000
0.00000
0.00000
0.000000
0.000000
0.00000
0.0
0.0
3
0.408248
0.0
0.00000
0.00000
0.0
0.00000
0.408248
0.408248
0.408248
0.00000
...
0.0
0.00000
0.00000
0.00000
0.00000
0.408248
0.408248
0.00000
0.0
0.0
4
0.000000
0.0
0.00000
0.00000
0.0
0.57735
0.000000
0.000000
0.000000
0.57735
...
0.0
0.00000
0.00000
0.00000
0.00000
0.000000
0.000000
0.00000
0.0
0.0
5
0.000000
0.0
0.00000
0.00000
0.0
0.00000
0.000000
0.000000
0.000000
0.00000
...
1.0
0.00000
0.00000
0.00000
0.00000
0.000000
0.000000
0.00000
0.0
0.0
6
0.000000
0.0
0.00000
0.00000
0.0
0.00000
0.000000
0.000000
0.000000
0.00000
...
0.0
0.57735
0.57735
0.00000
0.00000
0.000000
0.000000
0.57735
0.0
0.0
7
0.000000
0.0
0.00000
0.00000
0.0
0.00000
0.000000
0.000000
0.000000
0.00000
...
0.0
0.00000
0.00000
0.57735
0.57735
0.000000
0.000000
0.00000
0.0
0.0
8
0.000000
0.0
0.00000
0.00000
0.0
0.00000
0.000000
0.000000
0.000000
0.00000
...
0.0
0.00000
0.00000
0.00000
0.00000
0.000000
0.000000
0.00000
1.0
0.0
9
0.000000
0.0
0.00000
0.00000
0.0
0.00000
0.000000
0.000000
0.000000
0.00000
...
0.0
0.00000
0.00000
0.00000
0.00000
0.000000
0.000000
0.00000
0.0
1.0
10 rows × 23 columns
In [4]:
## Now try the same queries, but from Python.
import psycopg2
# connection settings
dbname, username = 'bills_db', 'Joel'
con = psycopg2.connect(database=dbname, user=username)
# every distinct subject label in the bill_subject table
sql_query = """
SELECT DISTINCT(subject) FROM bill_subject;
"""
all_subjects = pd.read_sql_query(sql=sql_query, con=con)['subject']
all_subjects.head()
Out[4]:
0 House Committee on Armed Services
1 Eritrea
2 Turkey
3 Minnesota
4 Political parties and affiliation
Name: subject, dtype: object
In [85]:
# TF-IDF weights of every distinct subject pulled from the database, shown as
# a dense frame (one row per subject, one column per n-gram).
pd.DataFrame(
    data=vect.fit_transform(all_subjects).toarray(),
    columns=vect.get_feature_names(),
)
Out[85]:
abort
abroad
abus
academic
academic performance
academic performance assess
access
access board
accid
accountability
...
world histori
wyom
yemen
york
york st
youth
youth employment
youth employment child
zealand
zimbabw
0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
1
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
2
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
3
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
4
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
5
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
6
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
7
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
8
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
9
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
10
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
11
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
12
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
13
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
14
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
15
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
16
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
17
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
18
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
19
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
20
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
21
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.354288
0.354288
0.354288
0.0
0.0
22
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
23
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
24
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
25
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
26
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
27
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
28
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
29
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
972
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
973
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
974
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
975
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
976
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
977
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
978
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
979
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
980
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
1.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
981
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
982
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
983
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
984
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
985
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
986
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
987
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
988
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
1.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
989
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
990
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
991
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
992
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
993
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
994
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
995
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
996
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
997
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
998
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
999
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
1000
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
1001
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.000000
0.000000
0.000000
0.0
0.0
1002 rows × 3271 columns
In [103]:
# Refit on the subjects and show the sparse result itself (shape and number of
# stored elements) instead of converting it to a dense DataFrame.
vect.fit_transform(all_subjects)
Out[103]:
<1002x3122 sparse matrix of type '<type 'numpy.float64'>'
with 4715 stored elements in Compressed Sparse Row format>
In [5]:
# Repeat the exercise for the bill titles.
# query: the bill_name column of every row in us_bills
sql_query = """
SELECT bill_name FROM us_bills;
"""
title_query = pd.read_sql_query(sql=sql_query, con=con)['bill_name']
In [6]:
# Peek at the first few titles to check the query result.
title_query.head()
Out[6]:
0 Regarding consent to assemble outside the seat...
1 Recognizing the challenges and burdens associa...
2 Expressing the sense of the Congress regarding...
3 Supporting the Association of American Veterin...
4 Providing for a joint session of Congress to r...
Name: bill_name, dtype: object
In [7]:
# TF-IDF matrix of the bill titles as a dense DataFrame.
# NOTE(review): this needs the cell that defines `vect` to have run first; on
# a fresh kernel the line raises NameError. The dense frame is also very wide
# (the filtered view of it reports 152626 columns), so .toarray() may need
# many gigabytes of memory -- confirm before re-running.
test_frame = pd.DataFrame(
    data=vect.fit_transform(title_query).toarray(),
    columns=vect.get_feature_names(),
)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-7-2e6cc746ee7c> in <module>()
----> 1 test_frame = pd.DataFrame(vect.fit_transform(title_query).toarray(), columns=vect.get_feature_names())
NameError: name 'vect' is not defined
In [111]:
# Keep only the titles whose TF-IDF weight for the term 'tax' is positive.
test_frame.loc[test_frame['tax'] > 0]
Out[111]:
00
00 cv
00 cv 03110
000
000 000
000 000 000
000 000 50
000 000 auditor
000 000 cause
000 000 consecutive
...
zone order
zone order help
zone purpose
zone transit
zone transit zone
zone united
zone united state
zoological
zoological veterinary
zoological veterinary medicine
21
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
47
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
80
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
84
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
108
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
151
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
181
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
262
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
292
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
329
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
389
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
396
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
406
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
411
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
412
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
415
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
434
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
450
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
451
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
456
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
459
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
489
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
498
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
501
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
533
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
546
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
577
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
583
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
588
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
593
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
10122
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10147
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10148
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10169
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10202
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10203
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10204
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10207
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10209
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10210
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10215
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10217
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10220
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10221
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10224
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10235
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10239
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10242
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10249
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10251
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10252
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10263
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10264
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10273
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10286
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10293
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10351
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10562
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10575
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
10826
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
...
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
449 rows × 152626 columns
In [112]:
# Check whether the same pipeline copes with the full data for U.S. bills:
# this time the bill_text column is pulled rather than the titles.
# query:
sql_query = """
SELECT bill_text FROM us_bills;
"""
bill_query = pd.read_sql_query(sql=sql_query, con=con)['bill_text']
In [117]:
# Number of bill texts returned by the query.
len(bill_query)
Out[117]:
10933
In [119]:
# Time the TF-IDF fit on the full bill texts; the recorded run took roughly
# 25 minutes of wall time.
# NOTE(review): .toarray() densifies the whole matrix -- confirm it fits in
# memory before re-running on the full corpus.
%time full_text_frame = pd.DataFrame(vect.fit_transform(bill_query).toarray(), columns=vect.get_feature_names())
CPU times: user 20min 50s, sys: 2min 11s, total: 23min 1s
Wall time: 25min 48s
In [ ]:
# Check to see if this will work on the full data for NY bills
In [ ]:
# Keep only the bills whose full text has a positive TF-IDF weight for 'tax'.
full_text_frame.loc[full_text_frame['tax'] > 0]
In [113]:
Out[113]:
0 {"\n","[Congressional Bills 114th Congress]\n"...
1 {"\n","[Congressional Bills 114th Congress]\n"...
2 {"\n","[Congressional Bills 114th Congress]\n"...
3 {"\n","[Congressional Bills 114th Congress]\n"...
4 {"\n","[Congressional Bills 114th Congress]\n"...
Name: bill_text, dtype: object
In [ ]:
Content source: joelmpiper/bill_taxonomy
Similar notebooks: