In [2]:
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
import pandas as pd
# Define the database name (this notebook works with a dataset of US
# congressional bills, hence bills_db) and the Postgres username for your
# computer (CHANGE IT BELOW).
dbname = 'bills_db'
username = 'Joel'
## 'engine' is a connection to a database
## Here, we're using postgres, but sqlalchemy can connect to other things too.
## The dialect name is spelled 'postgresql': the 'postgres' alias was removed
## in SQLAlchemy 1.4, while 'postgresql' works on old and new releases alike.
engine = create_engine('postgresql://%s@localhost/%s' % (username, dbname))
# print() with a single argument behaves the same on Python 2 and Python 3.
print(engine.url)


postgres://Joel@localhost/bills_db

In [3]:
## Query the bills database directly from Python.

# Open a psycopg2 connection using the database name and user defined earlier.
con = None
con = psycopg2.connect(user=username, database=dbname)

# Load every row of the us_bills table into a DataFrame.
sql_query = """
SELECT * FROM us_bills;
"""
us_bills = pd.read_sql_query(sql=sql_query, con=con)

# Preview the first rows.
us_bills.head()


Out[3]:
bill_num bill_name bill_text top_subject
0 hconres1-114 Regarding consent to assemble outside the seat... {"\n","[Congressional Bills 114th Congress]\n"... Congress
1 hconres10-114 Recognizing the challenges and burdens associa... {"\n","[Congressional Bills 114th Congress]\n"... Education
2 hconres100-114 Expressing the sense of the Congress regarding... {"\n","[Congressional Bills 114th Congress]\n"... International affairs
3 hconres101-114 Supporting the Association of American Veterin... {"\n","[Congressional Bills 114th Congress]\n"... Education
4 hconres102-114 Providing for a joint session of Congress to r... {"\n","[Congressional Bills 114th Congress]\n"... Congress

In [9]:
# Show the raw text stored for the first bill (position 0 of the bill_text column).
us_bills['bill_text'].iloc[0]


Out[9]:
'{"\n","[Congressional Bills 114th Congress]\n","[From the U.S. Government Printing Office]\n","[H. Con. Res. 1 Received in Senate (RDS)]\n","\n","114th CONGRESS\n","  1st Session\n","H. CON. RES. 1\n","\n","\n","_______________________________________________________________________\n","\n","\n","                   IN THE SENATE OF THE UNITED STATES\n","\n","                            January 7, 2015\n","\n","                                Received\n","\n","_______________________________________________________________________\n","\n","                         CONCURRENT RESOLUTION\n","\n","\n"," \n","     Regarding consent to assemble outside the seat of government.\n","\n","    Resolved by the House of Representatives (the Senate concurring), \n","That pursuant to clause 4, section 5, article I of the Constitution, \n","during the One Hundred Fourteenth Congress the Speaker of the House and \n","the Majority Leader of the Senate or their respective designees, acting \n","jointly after consultation with the Minority Leader of the House and \n","the Minority Leader of the Senate,\n","\n","\n","              \n","\n","may notify the Members of the House and the Senate, respectively, to \n","assemble at a place outside the District of Columbia if, in their \n","opinion, the public interest shall warrant it.\n","\n","            Passed the House of Representatives January 6, 2015.\n","\n","            Attest:\n","\n","                                                 KAREN L. HAAS,\n","\n","                                                                 Clerk.\n"}'

In [11]:
from os import path
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline


/Users/Joel/anaconda/envs/insight/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [14]:
# Keep just the text column as a Series; its shape shows one entry per bill.
bill_text = us_bills.bill_text
bill_text.shape


Out[14]:
(10933,)

In [17]:
# Whitespace-tokenise each bill: the result is one list of raw tokens per bill.
words = [document.split() for document in bill_text]

In [27]:
import re
# Flatten the per-bill token lists into a single list, removing every run of
# non-word characters and underscores from each token and lower-casing it.
# Tokens made up only of such characters become empty strings here.
non_alnum = re.compile(r'[\W_]+')
refined_words = [
    non_alnum.sub('', token).lower()
    for tokens in words
    for token in tokens
]

In [38]:
# Keep only tokens of four or more characters. Despite the name, this drops
# short tokens as well as the empty strings left behind by the cleaning step.
no_empties = [token for token in refined_words if len(token) >= 4]

In [41]:
# Number of tokens that survived the length filter, across all bills.
len(no_empties)


Out[41]:
13009999

In [46]:
# Build a word cloud from the first 100,000 filtered tokens.
# " ".join assembles the text in a single pass; the previous reduce() call
# concatenated strings pairwise (quadratic copying) and relied on reduce being
# a builtin, which is only true on Python 2.
wordcloud = WordCloud().generate(" ".join(no_empties[0:100000]))

In [47]:
# Draw the generated word cloud as an image and turn the axes off.
plt.imshow(wordcloud)
plt.axis("off")


Out[47]:
(-0.5, 399.5, 199.5, -0.5)

In [48]:
# Rebuild the word cloud from the same 100,000 tokens, this time with
# max_font_size=40 and relative_scaling=.5, and draw it in a fresh figure.
# " ".join replaces the quadratic, Python 2-only reduce() concatenation.
wordcloud = WordCloud(max_font_size=40, relative_scaling=.5).generate(" ".join(no_empties[0:100000]))
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()



In [ ]:


In [49]:
from sklearn.feature_extraction.text import CountVectorizer

In [50]:
# Learn the vocabulary of the whole corpus (fit returns the vectorizer itself)
# and list the learned terms.
vect = CountVectorizer().fit(bill_text)
vect.get_feature_names()


Out[50]:
[u'00',
 u'000',
 u'0000',
 u'000005042',
 u'00007',
 u'0001',
 u'0002',
 u'0003',
 u'00034',
 u'0005',
 u'000in',
 u'000mi',
 u'000th',
 u'001',
 u'00168',
 u'002',
 u'00200',
 u'0025',
 u'0028',
 u'002a',
 u'003',
 u'0030',
 u'00315',
 u'0035',
 u'0039',
 u'004',
 u'00436',
 u'005',
 u'00506',
 u'00549',
 u'0056',
 u'006',
 u'0060',
 u'0067',
 u'007',
 u'008',
 u'0080',
 u'00801',
 u'0081',
 u'0082',
 u'0083',
 u'009',
 u'0090',
 u'01',
 u'010',
 u'0100',
 u'0101',
 u'0101113f',
 u'0101122f',
 u'0101125f',
 u'0101126f',
 u'0101127f',
 u'0101213f',
 u'0101221n',
 u'0101224n',
 u'0101226n',
 u'0101313f',
 u'0101314f',
 u'0101316f',
 u'0101402n',
 u'0102110f',
 u'0102326f',
 u'0105921f',
 u'0106',
 u'01080',
 u'011',
 u'0117',
 u'012',
 u'01247',
 u'012a',
 u'013',
 u'01367',
 u'013b',
 u'014',
 u'014a',
 u'015',
 u'015a',
 u'016',
 u'016a',
 u'017',
 u'0172',
 u'0178',
 u'018',
 u'01832',
 u'0189',
 u'019',
 u'019a',
 u'01b',
 u'01h',
 u'01rv14136',
 u'02',
 u'020',
 u'0202',
 u'0202429a',
 u'0203345d8z',
 u'0203728a',
 u'0203735a',
 u'0203740a',
 u'0203744a',
 u'0203752a',
 u'0203758a',
 u'0203761n',
 u'0203801a',
 u'0203802a',
 u'0203808a',
 u'0204136n',
 u'0204163n',
 u'0204202n',
 u'0204228n',
 u'0204229n',
 u'0204311n',
 u'0204413n',
 u'0204460m',
 u'0204571j',
 u'0204571n',
 u'0204574n',
 u'0204575n',
 u'0205219f',
 u'0205402a',
 u'0205410a',
 u'0205456a',
 u'0205601n',
 u'0205604n',
 u'0205620n',
 u'0205632n',
 u'0205633n',
 u'0205671f',
 u'0205675n',
 u'0205778a',
 u'0206313m',
 u'0206335m',
 u'0206623m',
 u'0206624m',
 u'0206625m',
 u'0206629m',
 u'0207110f',
 u'0207131f',
 u'0207133f',
 u'0207134f',
 u'0207136f',
 u'0207138f',
 u'0207142f',
 u'0207161f',
 u'0207161n',
 u'0207163f',
 u'0207163n',
 u'0207171f',
 u'0207224f',
 u'0207227f',
 u'0207247f',
 u'0207249f',
 u'0207253f',
 u'0207268f',
 u'0207325f',
 u'0207410f',
 u'0207412f',
 u'0207417f',
 u'0207418f',
 u'0207431f',
 u'0207444f',
 u'0207448f',
 u'0207452f',
 u'0207455f',
 u'0207590f',
 u'0207601f',
 u'0207605f',
 u'0207697f',
 u'0207701f',
 u'0208006f',
 u'0208043j',
 u'0208045k',
 u'0208053a',
 u'0208087f',
 u'0208088f',
 u'021',
 u'0210609a',
 u'0211',
 u'0219902m',
 u'022',
 u'023',
 u'024',
 u'025',
 u'025a',
 u'026',
 u'027',
 u'028',
 u'028a',
 u'028b',
 u'029',
 u'03',
 u'030',
 u'03005',
 u'0301017f',
 u'0301112f',
 u'0301144k',
 u'0301400f',
 u'0302015f',
 u'0302016k',
 u'0302019k',
 u'0303001f',
 u'0303028a',
 u'0303032a',
 u'0303109n',
 u'0303126k',
 u'0303131f',
 u'0303131k',
 u'0303135g',
 u'0303136g',
 u'0303138n',
 u'0303140a',
 u'0303140d8z',
 u'0303140f',
 u'0303140g',
 u'0303140n',
 u'0303141a',
 u'0303141f',
 u'0303141k',
 u'0303142a',
 u'0303142f',
 u'0303150a',
 u'0303150k',
 u'0303150m',
 u'0303153k',
 u'0303166j',
 u'0303170k',
 u'0303191d8z',
 u'0303228k',
 u'0303260a',
 u'0303260d8z',
 u'0303310d8z',
 u'0303354n',
 u'0303430k',
 u'0303610k',
 u'0304210bb',
 u'0304231n',
 u'0304260f',
 u'0304270a',
 u'0304270n',
 u'0304785n',
 u'0305099f',
 u'0305103c',
 u'0305103k',
 u'0305110f',
 u'0305111f',
 u'0305114f',
 u'0305116f',
 u'0305124n',
 u'0305128f',
 u'0305145f',
 u'0305146f',
 u'0305160f',
 u'0305160n',
 u'0305164f',
 u'0305172k',
 u'0305173f',
 u'0305174f',
 u'0305176f',
 u'0305179a',
 u'0305179f',
 u'0305182f',
 u'0305186d8z',
 u'0305192n',
 u'0305193d8z',
 u'0305199d8z',
 u'0305202f',
 u'0305204a',
 u'0305204n',
 u'0305205n',
 u'0305206a',
 u'0305206f',
 u'0305207f',
 u'0305208a',
 u'0305208bb',
 u'0305208f',
 u'0305208k',
 u'0305208m',
 u'0305208n',
 u'0305219a',
 u'0305219f',
 u'0305220f',
 u'0305220n',
 u'0305221f',
 u'0305231n',
 u'0305232a',
 u'0305232m',
 u'0305233a',
 u'0305233n',
 u'0305234n',
 u'0305236f',
 u'0305238f',
 u'0305239m',
 u'0305240f',
 u'0305241n',
 u'0305242m',
 u'0305251a',
 u'0305258f',
 u'0305265f',
 u'0305304d8z',
 u'0305327v',
 u'0305387d8z',
 u'0305421n',
 u'0305600f',
 u'0305614f',
 u'0305881f',
 u'0305906f',
 u'0305913f',
 u'0305940f',
 u'0306250f',
 u'0306250m',
 u'0306415f',
 u'0307577d8z',
 u'0307581f',
 u'0307665a',
 u'0308601n',
 u'0308602f',
 u'0308699f',
 u'030a',
 u'031',
 u'0310349a',
 u'03110',
 u'032',
 u'03232',
 u'0327',
 u'033',
 u'034',
 u'0340',
 u'034a',
 u'035',
 u'036',
 u'03600',
 u'036a',
 u'037',
 u'037a',
 u'038',
 u'039',
 u'0390',
 u'0391',
 u'0396',
 u'039a',
 u'04',
 u'040',
 u'0401115f',
 u'0401119f',
 u'0401130f',
 u'0401132f',
 u'0401134f',
 u'0401219f',
 u'0401314f',
 u'0401318f',
 u'0401319f',
 u'0408011f',
 u'040a',
 u'041',
 u'042',
 u'043',
 u'0433',
 u'044',
 u'0447',
 u'045',
 u'046',
 u'047',
 u'048',
 u'049',
 u'0495',
 u'04b',
 u'05',
 u'050',
 u'0505',
 u'051',
 u'052',
 u'053',
 u'054',
 u'055',
 u'0554',
 u'056',
 u'0560',
 u'0564',
 u'057',
 u'058',
 u'059',
 u'0593215',
 u'06',
 u'060',
 u'0601000br',
 u'0601101a',
 u'0601101e',
 u'0601102a',
 u'0601102f',
 u'0601103a',
 u'0601103f',
 u'0601103n',
 u'0601104a',
 u'0601108f',
 u'0601110d8z',
 u'0601117e',
 u'0601120d8z',
 u'0601152n',
 u'0601153n',
 u'0601228d8z',
 u'0601384bp',
 u'0602',
 u'0602000d8z',
 u'0602102f',
 u'0602105a',
 u'0602114n',
 u'0602115e',
 u'0602120a',
 u'0602122a',
 u'0602123n',
 u'0602131m',
 u'0602201f',
 u'0602202f',
 u'0602203f',
 u'0602204f',
 u'0602211a',
 u'0602230d8z',
 u'0602234d8z',
 u'0602235n',
 u'0602236n',
 u'0602251d8z',
 u'0602270a',
 u'0602271n',
 u'0602303a',
 u'0602303e',
 u'0602307a',
 u'0602308a',
 u'0602383e',
 u'0602384bp',
 u'0602435n',
 u'0602601a',
 u'0602601f',
 u'0602602f',
 u'0602605f',
 u'0602618a',
 u'0602622a',
 u'0602623a',
 u'0602624a',
 u'0602651m',
 u'0602668d8z',
 u'0602702e',
 u'0602705a',
 u'0602709a',
 u'0602712a',
 u'0602715e',
 u'0602716a',
 u'0602716e',
 u'0602718br',
 u'0602720a',
 u'0602747n',
 u'0602750n',
 u'0602751d8z',
 u'0602782a',
 u'0602782n',
 u'0602783a',
 u'0602784a',
 u'0602785a',
 u'0602786a',
 u'0602787a',
 u'0602788f',
 u'0602890f',
 u'0602898n',
 u'0603',
 u'0603000d8z',
 u'0603001a',
 u'0603002a',
 u'0603003a',
 u'0603004a',
 u'0603005a',
 u'0603006a',
 u'0603007a',
 u'0603009a',
 u'0603015a',
 u'0603020a',
 u'0603112f',
 u'0603114n',
 u'0603122d8z',
 u'0603123n',
 u'0603125a',
 u'0603130a',
 u'0603131a',
 u'0603133d8z',
 u'0603160br',
 u'0603161d8z',
 u'0603176c',
 u'0603177c',
 u'0603178c',
 u'0603179c',
 u'0603180c',
 u'0603199f',
 u'0603203f',
 u'0603207n',
 u'0603208n',
 u'0603211f',
 u'0603216f',
 u'0603216n',
 u'0603225d8z',
 u'0603237n',
 u'0603251n',
 u'0603254n',
 u'0603260f',
 u'0603261n',
 u'0603264s',
 u'0603270a',
 u'0603270f',
 u'0603271n',
 u'0603274c',
 u'0603286e',
 u'0603287e',
 u'0603288d8z',
 u'0603289d8z',
 u'0603294c',
 u'0603305a',
 u'0603308a',
 u'0603313a',
 u'0603322a',
 u'0603375d8z',
 u'0603382n',
 u'0603384bp',
 u'0603401f',
 u'0603423f',
 u'0603438f',
 u'0603444f',
 u'0603456f',
 u'0603461a',
 u'0603502n',
 u'0603506n',
 u'0603512n',
 u'0603525n',
 u'0603527d8z',
 u'0603527n',
 u'0603536n',
 u'0603542n',
 u'0603553n',
 u'0603561n',
 u'0603562n',
 u'0603563n',
 u'0603564n',
 u'0603570n',
 u'0603573n',
 u'0603576n',
 u'0603581n',
 u'0603582n',
 u'0603595n',
 u'0603596n',
 u'0603597n',
 u'0603599n',
 u'0603600d8z',
 u'0603601f',
 u'0603605f',
 u'0603606a',
 u'0603607a',
 u'0603609n',
 u'0603611m',
 u'0603618d8z',
 u'0603619a',
 u'0603627a',
 u'0603635m',
 u'0603639a',
 u'0603640m',
 u'0603648d8z',
 u'0603651m',
 u'0603654n',
 u'0603658n',
 u'0603662d8z',
 u'0603673n',
 u'0603680d8z',
 u'0603680f',
 u'0603680n',
 u'0603680s',
 u'0603699d8z',
 u'0603710a',
 u'0603712s',
 u'0603713n',
 u'0603713s',
 u'0603714d8z',
 u'0603716d8z',
 u'0603720s',
 u'0603721n',
 u'0603724n',
 u'0603725n',
 u'0603727d8z',
 u'0603728a',
 u'0603729n',
 u'0603734a',
 u'0603734n',
 u'0603739e',
 u'0603739n',
 u'0603742f',
 u'0603746n',
 u'0603747a',
 u'0603747n',
 u'0603748n',
 u'0603751n',
 u'0603758n',
 u'0603760e',
 u'0603764n',
 u'0603766a',
 u'0603766e',
 u'0603767e',
 u'0603769se',
 u'0603772a',
 u'0603774a',
 u'0603778a',
 u'0603779a',
 u'0603781d8z',
 u'0603782n',
 u'0603787n',
 u'0603788f',
 u'0603790a',
 u'0603790f',
 u'0603790n',
 u'0603794a',
 u'0603795n',
 u'0603801a',
 u'0603804a',
 u'0603807a',
 u'0603813a',
 u'0603821d8z',
 u'0603826d8z',
 u'0603827a',
 u'0603830f',
 u'0603833d8z',
 u'0603851d8z',
 u'0603851f',
 u'0603851m',
 u'0603860n',
 u'0603881c',
 u'0603882c',
 u'0603884bp',
 u'0603884c',
 u'0603890c',
 u'0603891c',
 u'0603892c',
 u'0603893c',
 u'0603895c',
 u'0603896c',
 u'0603898c',
 u'0603904c',
 u'0603906c',
 u'0603907c',
 u'0603913c',
 u'0603914c',
 u'0603915c',
 u'0603920d8z',
 u'0603923d8z',
 u'0603925n',
 u'0603941d8z',
 u'0603xxxc',
 u'0603xxxx',
 u'0604015f',
 u'0604016d8z',
 u'0604055d8z',
 u'0604100a',
 u'0604112n',
 u'0604114a',
 u'0604115a',
 u'0604115c',
 u'0604120a',
 u'0604122n',
 u'0604130v',
 u'0604132d8z',
 u'0604161d8z',
 u'0604165d8z',
 u'0604201a',
 u'0604212n',
 u'0604214n',
 u'0604215n',
 u'0604216n',
 u'0604218n',
 u'0604221n',
 u'0604230n',
 u'0604231n',
 u'0604233f',
 u'0604234n',
 u'0604245n',
 u'0604250d8z',
 u'0604256a',
 u'0604256f',
 u'0604256n',
 u'0604257f',
 u'0604258a',
 u'0604258n',
 u'0604261n',
 u'0604262n',
 u'0604264n',
 u'0604269n',
 u'0604270a',
 u'0604270f',
 u'0604270n',
 u'0604272n',
 u'0604273n',
 u'0604274n',
 u'0604279n',
 u'0604280a',
 u'0604280n',
 u'0604281f',
 u'0604282n',
 u'0604287f',
 u'0604290a',
 u'0604292n',
 u'0604307n',
 u'0604311n',
 u'0604317f',
 u'0604319a',
 u'0604321a',
 u'0604327f',
 u'0604328a',
 u'0604329f',
 u'0604329n',
 u'0604366n',
 u'0604373n',
 u'0604376m',
 u'0604378n',
 u'0604384bp',
 u'0604400d8z',
 u'0604404n',
 u'0604421f',
 u'0604422f',
 u'0604425f',
 u'0604426f',
 u'0604429f',
 u'0604441f',
 u'0604445f',
 u'0604454n',
 u'0604501n',
 u'0604503n',
 u'0604504n',
 u'0604512n',
 u'0604518n',
 u'0604522n',
 u'0604536n',
 u'0604558n',
 u'0604562n',
 u'0604567n',
 u'0604574n',
 u'0604580n',
 u'0604601a',
 u'0604601n',
 u'0604602f',
 u'0604604f',
 u'0604610n',
 u'0604611a',
 u'0604617f',
 u'0604618f',
 u'0604622a',
 u'0604633a',
 u'0604641a',
 u'0604642a',
 u'0604645a',
 u'0604653n',
 u'0604654n',
 u'0604659n',
 u'0604682d8z',
 u'0604703n',
 u'0604706f',
 u'0604707n',
 u'0604710a',
 u'0604713a',
 u'0604715a',
 u'0604727n',
 u'0604735f',
 u'0604741a',
 u'0604742a',
 u'0604746a',
 u'0604755n',
 u'0604756n',
 u'0604757n',
 u'0604759a',
 u'0604759f',
 u'0604759n',
 u'0604760a',
 u'0604761n',
 u'0604764k',
 u'0604771d8z',
 u'0604771n',
 u'0604774d8z',
 u'0604776f',
 u'0604777n',
 u'0604780a',
 u'0604786n',
 u'0604798a',
 u'0604800f',
 u'0604800m',
 u'0604800n',
 u'0604802a',
 u'0604804a',
 u'0604805a',
 u'0604807a',
 u'0604808a',
 u'0604810m',
 u'0604810n',
 u'0604818a',
 u'0604820a',
 u'0604822a',
 u'0604823a',
 u'0604826j',
 u'0604827a',
 u'0604853f',
 u'0604854a',
 u'0604857f',
 u'0604858f',
 u'0604873c',
 u'0604874c',
 u'0604875d8z',
 u'0604876c',
 u'0604878c',
 u'0604879c',
 u'0604880c',
 u'0604881c',
 u'0604887c',
 u'0604894c',
 u'0604932f',
 u'0604933f',
 u'0604940d8z',
 u'0604942d8z',
 u'0604xxxd',
 u'0604xxxf',
 u'0605000br',
 u'0605001e',
 u'0605013a',
 u'0605013bl',
 u'0605013m',
 u'0605013n',
 u'0605018a',
 u'0605018f',
 u'0605021se',
 u'0605022d8z',
 u'0605024a',
 u'0605024f',
 u'0605024n',
 u'0605027d8z',
 u'0605028a',
 u'0605029a',
 u'0605030a',
 u'0605030f',
 u'0605031a',
 u'0605032a',
 u'0605033a',
 u'0605034a',
 u'0605035a',
 u'0605036a',
 u'0605041a',
 u'0605042a',
 u'0605047a',
 u'0605051a',
 u'0605052a',
 u'0605070s',
 u'0605075d8z',
 u'0605080s',
 u'0605090s',
 u'0605100d8z',
 u'0605101f',
 u'0605103a',
 u'0605104d8z',
 u'0605117f',
 u'0605118ote',
 u'0605126j',
 u'0605126n',
 u'0605127t',
 u'0605131ote',
 u'0605140d8z',
 u'0605142d8z',
 u'0605147t',
 u'0605151d8z',
 u'0605152n',
 u'0605154n',
 u'0605161d8z',
 u'0605170d8z',
 u'0605200d8z',
 u'0605210d8z',
 u'0605212n',
 u'0605213f',
 u'0605214f',
 u'0605215n',
 u'0605217n',
 u'0605220n',
 u'0605221f',
 u'0605223f',
 u'0605229f',
 u'0605230f',
 u'0605278f',
 u'0605285n',
 u'0605301a',
 u'0605326a',
 u'0605327n',
 u'0605350a',
 u'0605380a',
 u'0605384bp',
 u'0605414n',
 u'0605431f',
 u'0605432f',
 u'0605433f',
 u'0605450a',
 u'0605450n',
 u'0605456a',
 u'0605457a',
 u'0605458f',
 u'0605500n',
 u'0605504n',
 u'0605601a',
 u'0605602a',
 u'0605604a',
 u'0605606a',
 u'0605625a',
 u'0605626a',
 u'0605702a',
 u'0605706a',
 u'0605709a',
 u'0605712a',
 u'0605712f',
 u'0605716a',
 u'0605718a',
 u'0605766a',
 u'0605790d8z',
 u'0605798d8z',
 u'0605801a',
 u'0605801ka',
 u'0605803a',
 u'0605803se',
 u'0605804d8z',
 u'0605804n',
 u'0605805a',
 u'0605807f',
 u'0605812a',
 u'0605812m',
 u'0605814ote',
 u'0605830a',
 u'0605853n',
 u'0605856n',
 u'0605857a',
 u'0605860f',
 u'0605861n',
 u'0605863n',
 u'0605864f',
 u'0605864n',
 u'0605865n',
 u'0605866n',
 u'0605867n',
 u'0605873m',
 u'0605898a',
 u'0605898e',
 u'0605898n',
 u'0605931f',
 u'0605976f',
 u'0605978f',
 u'0605998ka',
 u'0606017f',
 u'0606100d8z',
 u'0606116f',
 u'0606355n',
 u'0606392f',
 u'0607131a',
 u'0607133a',
 u'0607134a',
 u'0607135a',
 u'0607136a',
 u'0607137a',
 u'0607138a',
 u'0607139a',
 u'0607140a',
 u'0607141a',
 u'0607210d8z',
 u'0607310d8z',
 u'0607327t',
 u'0607384bp',
 u'0607658n',
 u'0607665a',
 u'0607700n',
 u'0607865a',
 u'061',
 u'062',
 u'0621',
 u'0625',
 u'063',
 u'06371',
 u'064',
 u'0648',
 u'065',
 u'066',
 u'067',
 u'068',
 u'069',
 u'0699',
 u'06cv2239tfh',
 u'07',
 u'070',
 u'0701212f',
 u'0702207f',
 u'0702207n',
 u'0702806f',
 u'0707',
 u'0708011s',
 u'0708012s',
 u'0708045a',
 u'0708047s',
 u'0708610f',
 u'0708611f',
 u'0708730n',
 u'070a',
 u'071',
 ...]

In [52]:
# Encode every bill against the fitted vocabulary; the result is a sparse
# document-term matrix with one row per bill and one column per term.
simple_train_dtm = vect.transform(bill_text)
simple_train_dtm


Out[52]:
<10933x55476 sparse matrix of type '<type 'numpy.int64'>'
	with 3754520 stored elements in Compressed Sparse Row format>

In [54]:
# Add a label column for the "International affairs" subject, initialised to 0
# for every bill. This mutates us_bills in place.
# NOTE(review): no visible cell sets this column to 1 -- confirm whether it is
# populated elsewhere.
us_bills['international_affairs']=0

In [103]:
# Add the label column for the marine subject, initialised to 0 for every
# bill. This mutates us_bills in place.
us_bills['marine_and_inland_water_transportation'] = 0

In [104]:
# Confirm that both new label columns exist and start out as 0.
us_bills.head()


Out[104]:
bill_num bill_name bill_text top_subject international_affairs marine_and_inland_water_transportation
0 hconres1-114 Regarding consent to assemble outside the seat... {"\n","[Congressional Bills 114th Congress]\n"... Congress 0 0
1 hconres10-114 Recognizing the challenges and burdens associa... {"\n","[Congressional Bills 114th Congress]\n"... Education 0 0
2 hconres100-114 Expressing the sense of the Congress regarding... {"\n","[Congressional Bills 114th Congress]\n"... International affairs 0 0
3 hconres101-114 Supporting the Association of American Veterin... {"\n","[Congressional Bills 114th Congress]\n"... Education 0 0
4 hconres102-114 Providing for a joint session of Congress to r... {"\n","[Congressional Bills 114th Congress]\n"... Congress 0 0

In [94]:
# Fetch the (bill_num, subject) rows tagged with the international affairs
# subject; both capitalisations of the subject name are matched explicitly.
# NOTE(review): international_terms is not used by any visible cell -- confirm.
sql_query = """
SELECT bill_num, subject FROM bill_subject WHERE subject='International affairs' OR subject='International Affairs';
"""
international_terms = pd.read_sql_query(sql_query,con)

In [100]:
# Fetch the (bill_num, subject) rows tagged with the marine and inland water
# transportation subject; these bill numbers become the positive class.
sql_query = """
SELECT bill_num, subject FROM bill_subject WHERE subject='Marine and inland water transportation';
"""
marine_terms = pd.read_sql_query(sql_query,con)

In [101]:
# Inspect the bills returned for the marine subject.
marine_terms


Out[101]:
bill_num subject
0 hr1056-114 Marine and inland water transportation
1 hr1135-114 Marine and inland water transportation
2 hr1248-114 Marine and inland water transportation
3 hr1288-114 Marine and inland water transportation
4 hr1308-114 Marine and inland water transportation
5 hr142-114 Marine and inland water transportation
6 hr1540-114 Marine and inland water transportation
7 hr1665-114 Marine and inland water transportation
8 hr1735-114 Marine and inland water transportation
9 hr1804-114 Marine and inland water transportation
10 hr1823-114 Marine and inland water transportation
11 hr1900-114 Marine and inland water transportation
12 hr198-114 Marine and inland water transportation
13 hr1987-114 Marine and inland water transportation
14 hr2028-114 Marine and inland water transportation
15 hr2029-114 Marine and inland water transportation
16 hr2048-114 Marine and inland water transportation
17 hr22-114 Marine and inland water transportation
18 hr229-114 Marine and inland water transportation
19 hr240-114 Marine and inland water transportation
20 hr2410-114 Marine and inland water transportation
21 hr2485-114 Marine and inland water transportation
22 hr2534-114 Marine and inland water transportation
23 hr2577-114 Marine and inland water transportation
24 hr2700-114 Marine and inland water transportation
25 hr2876-114 Marine and inland water transportation
26 hr2923-114 Marine and inland water transportation
27 hr2992-114 Marine and inland water transportation
28 hr3064-114 Marine and inland water transportation
29 hr3142-114 Marine and inland water transportation
... ... ...
107 s2130-114 Marine and inland water transportation
108 s2206-114 Marine and inland water transportation
109 s2328-114 Marine and inland water transportation
110 s2378-114 Marine and inland water transportation
111 s2635-114 Marine and inland water transportation
112 s268-114 Marine and inland water transportation
113 s272-114 Marine and inland water transportation
114 s2726-114 Marine and inland water transportation
115 s2829-114 Marine and inland water transportation
116 s2844-114 Marine and inland water transportation
117 s2848-114 Marine and inland water transportation
118 s2865-114 Marine and inland water transportation
119 s2989-114 Marine and inland water transportation
120 s3001-114 Marine and inland water transportation
121 s33-114 Marine and inland water transportation
122 s371-114 Marine and inland water transportation
123 s373-114 Marine and inland water transportation
124 s515-114 Marine and inland water transportation
125 s525-114 Marine and inland water transportation
126 s589-114 Marine and inland water transportation
127 s764-114 Marine and inland water transportation
128 s825-114 Marine and inland water transportation
129 s834-114 Marine and inland water transportation
130 s859-114 Marine and inland water transportation
131 sconres11-114 Marine and inland water transportation
132 sres153-114 Marine and inland water transportation
133 sres291-114 Marine and inland water transportation
134 sres332-114 Marine and inland water transportation
135 sres359-114 Marine and inland water transportation
136 sres370-114 Marine and inland water transportation

137 rows × 2 columns


In [112]:
# Mark every bill whose bill_num appears in marine_terms as a positive example.
# .loc replaces the .ix indexer, which is deprecated and was removed in
# pandas 1.0; with a boolean row mask and a column label the two select the
# same cells.
us_bills.loc[us_bills['bill_num'].isin(marine_terms['bill_num']), 'marine_and_inland_water_transportation'] = 1

In [116]:
# Features are the raw bill texts; the target is the 0/1 marine subject flag.
X, y = (
    us_bills['bill_text'],
    us_bills['marine_and_inland_water_transportation'],
)

In [117]:
# split into training and testing sets
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18 and
# sklearn.cross_validation was removed in 0.20, so prefer the new location and
# fall back to the old module only on installs that predate it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# random_state pins the shuffle so the split is reproducible; no test_size is
# given, so the library's default hold-out proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [118]:
# import and instantiate the vectorizer
# This rebinds `vect` to a fresh, unfitted CountVectorizer, so the vocabulary
# used from here on is learned only from the data it is fitted on next.
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()

In [119]:
# fit and transform X_train, but only transform X_test
# The vocabulary is learned from the training texts alone; the test texts are
# encoded against that same vocabulary, so both matrices share their columns.
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

In [120]:
# import/instantiate/fit
# Train a multinomial Naive Bayes classifier on the training document-term
# matrix, using default hyper-parameters.
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)


Out[120]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [121]:
# make class predictions (hard 0/1 labels) for the held-out test bills
y_pred_class = nb.predict(X_test_dtm)

In [122]:
# calculate accuracy
from sklearn import metrics
# print() with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
# NOTE(review): positives are rare in y_test, so accuracy alone says little
# about how well the marine class is detected.
print(metrics.accuracy_score(y_test, y_pred_class))


0.966349670812

In [124]:
# predict class probabilities
# predict_proba returns one column per class; column index 1 is kept, which is
# the second class in nb.classes_ (label 1 here, given the 0/1 target).
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]

In [143]:
# List the test-set bills whose true label is 1 (the positive class).
y_test.loc[y_test == 1]


Out[143]:
5850    1
7602    1
7454    1
8454    1
2954    1
8211    1
8082    1
1443    1
8110    1
3027    1
7816    1
7920    1
9802    1
7709    1
916     1
7326    1
4852    1
9797    1
3432    1
443     1
6484    1
1206    1
6238    1
5493    1
7853    1
6504    1
4269    1
1859    1
1984    1
3701    1
642     1
7555    1
8956    1
3449    1
8999    1
Name: marine_and_inland_water_transportation, dtype: int64

In [126]:
# calculate the AUC using y_test and y_pred_prob
# (print() with a single argument behaves the same on Python 2 and Python 3)
print(metrics.roc_auc_score(y_test, y_pred_prob))


0.799719472821

In [127]:
%matplotlib inline
import matplotlib.pyplot as plt

In [129]:
# plot ROC curve using y_test and y_pred_prob
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
# pin both axes to the full [0, 1] range
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')


Out[129]:
<matplotlib.text.Text at 0x237fd8d10>

In [130]:
# confusion matrix: rows are true labels, columns are predicted labels,
# i.e. [[TN, FP], [FN, TP]] for the two classes here
print metrics.confusion_matrix(y_test, y_pred_class)


[[2622   77]
 [  15   20]]

In [131]:
# first 10 false positives (bills predicted as marine and inland water transportation
# whose true label is 0); y_test < y_pred_class is True only where truth is 0 and prediction is 1
X_test[y_test < y_pred_class][:10]


Out[131]:
9574    {"\n","[Congressional Bills 114th Congress]\n"...
1777    {"\n","[Congressional Bills 114th Congress]\n"...
2106    {"\n","[Congressional Bills 114th Congress]\n"...
941     {"\n","[Congressional Bills 114th Congress]\n"...
8119    {"\n","[Congressional Bills 114th Congress]\n"...
9110    {"\n","[Congressional Bills 114th Congress]\n"...
4803    {"\n","[Congressional Bills 114th Congress]\n"...
7267    {"\n","[Congressional Bills 114th Congress]\n"...
4467    {"\n","[Congressional Bills 114th Congress]\n"...
9248    {"\n","[Congressional Bills 114th Congress]\n"...
Name: bill_text, dtype: object

In [139]:
# total number of false positives in the test set
len(X_test[y_test < y_pred_class])


Out[139]:
77

In [132]:
# full text of the bill with index label 9574, the first false positive listed above
X_test[9574]


Out[132]:
'{"\n","[Congressional Bills 114th Congress]\n","[From the U.S. Government Printing Office]\n","[S. 327 Introduced in Senate (IS)]\n","\n","114th CONGRESS\n","  1st Session\n","                                 S. 327\n","\n","  To provide for auditable financial statements for the Department of \n","                    Defense, and for other purposes.\n","\n","\n","_______________________________________________________________________\n","\n","\n","                   IN THE SENATE OF THE UNITED STATES\n","\n","                            February 2, 2015\n","\n","Mr. Manchin (for himself, Mr. Paul, Mr. Wyden, and Mr. Cruz) introduced \n","the following bill; which was read twice and referred to the Committee \n","                           on Armed Services\n","\n","_______________________________________________________________________\n","\n","                                 A BILL\n","\n","\n"," \n","  To provide for auditable financial statements for the Department of \n","                    Defense, and for other purposes.\n","\n","    Be it enacted by the Senate and House of Representatives of the \n","United States of America in Congress assembled,\n","\n","SECTION 1. SHORT TITLE.\n","\n","    This Act may be cited as the ``Audit the Pentagon Act of 2015\'\'.\n","\n","SEC. 2. FINDINGS.\n","\n","    Congress makes the following findings:\n","            (1) Section 9 of Article I of the Constitution of the \n","        United States requires all agencies of the Federal Government, \n","        including the Department of Defense, to publish ``a regular \n","        statement and account of the receipts and expenditures of all \n","        public money\'\'.\n","            (2) Section 3515 of title 31, United States Code, requires \n","        the agencies of the Federal Government, including the \n","        Department of Defense, to present auditable financial \n","        statements beginning not later than March 1, 1997. 
The \n","        Department has not complied with this law.\n","            (3) The Federal Financial Management Improvement Act of \n","        1996 (31 U.S.C. 3512 note) requires financial systems acquired \n","        by the Federal Government, including the Department of Defense, \n","        to be able to provide information to leaders to manage and \n","        control the cost of Government. The Department has not complied \n","        with this law.\n","            (4) The financial management of the Department of Defense \n","        has been on the ``High-Risk\'\' list of the Government \n","        Accountability Office, which means that the Department is not \n","        consistently able to ``control costs; ensure basic \n","        accountability; anticipate future costs and claims on the \n","        budget; measure performance; maintain funds control; [and] \n","        prevent and detect fraud, waste, and abuse\'\'.\n","            (5) The National Defense Authorization Act for Fiscal Year \n","        2002 (Public Law 107-107) requires the Secretary of Defense to \n","        report to Congress annually on the reliability of the financial \n","        statements of the Department of Defense, to minimize resources \n","        spent on producing unreliable financial statements, and to use \n","        resources saved to improve financial management policies, \n","        procedures, and internal controls.\n","            (6) In 2005, the Department of Defense created a Financial \n","        Improvement and Audit Readiness (FIAR) Plan, overseen by a \n","        directorate within the office of the Under Secretary of Defense \n","        (Comptroller), to improve Department business processes with \n","        the goal of producing timely, reliable, and accurate financial \n","        information that could generate an audit-ready annual financial \n","        statement. 
In December 2005, that directorate, known as the \n","        FIAR Directorate, issued the first of a series of semiannual \n","        reports on the status of the Financial Improvement and Audit \n","        Readiness Plan.\n","            (7) The National Defense Authorization Act for Fiscal Year \n","        2010 (Public Law 111-84) requires regular status reports on the \n","        Financial Improvement and Audit Readiness Plan described in \n","        paragraph (6), and codified as a statutory requirement the goal \n","        of the Plan in ensuring that Department of Defense financial \n","        statements are validated as ready for audit not later than \n","        September 30, 2017. In addition, the National Defense \n","        Authorization Act for Fiscal Year 2013 (Public Law 112-239) \n","        requires that the statement of budgetary resources of the \n","        Department of Defense be validated as ready for audit by not \n","        later than September 30, 2014.\n","            (8) At a September 2010 hearing of the Senate, the \n","        Government Accountability Office stated that past expenditures \n","        by the Department of Defense of $5,800,000,000 to improve \n","        financial information, and billions of dollars more of \n","        anticipated expenditures on new information technology systems \n","        for that purpose, may not suffice to achieve full audit \n","        readiness of the financial statement of the Department. 
At that \n","        hearing, the Government Accountability Office could not predict \n","        when the Department would achieve full audit readiness of such \n","        statements.\n","            (9) At a 2013 hearing of the Senate, Secretary of Defense \n","        Chuck Hagel affirmed his commitment to audit-ready budget \n","        statements for the Department of Defense by the end of 2014, \n","        and stated that he ``will do everything he can to fulfill this \n","        commitment\'\'. At that hearing, Secretary Hagel noted that \n","        auditable financial statements were essential to the Department \n","        not only for improving the quality of its financial \n","        information, but also for reassuring the public and Congress \n","        that it is a good steward of public funds.\n","\n","SEC. 3. CESSATION OF APPLICABILITY OF REPORTING REQUIREMENTS REGARDING \n","              THE FINANCIAL STATEMENTS OF THE DEPARTMENT OF DEFENSE.\n","\n","    (a) Cessation of Applicability.--\n","            (1) Military departments.--The financial statements of a \n","        military department shall cease to be covered by the reporting \n","        requirements specified in subsection (b) upon the issuance of \n","        an unqualified audit opinion on such financial statements.\n","            (2) Department of defense.--The reporting requirements \n","        specified in subsection (b) shall cease to be effective when an \n","        unqualified audit opinion is issued on the financial statements \n","        of the Department of Defense, including each of the military \n","        departments and the other reporting entities defined by the \n","        Office of Management and Budget.\n","    (b) Reporting Requirements.--The reporting requirements specified \n","in this subsection are the following:\n","            (1) The requirement for annual reports in section 892(b) of \n","        the Ike Skelton National Defense Authorization Act 
for Fiscal \n","        Year 2011 (Public Law 111-383; 124 Stat. 4311; 10 U.S.C. 2306a \n","        note).\n","            (2) The requirement for semi-annual reports in section \n","        1003(b) of the National Defense Authorization Act for Fiscal \n","        Year 2010 (Public Law 111-84; 123 Stat. 2440; 10 U.S.C. 2222 \n","        note).\n","            (3) The requirement for annual reports in section 817(d) of \n","        the Bob Stump National Defense Authorization Act for Fiscal \n","        Year 2003 (10 U.S.C. 2306a note).\n","            (4) The requirement for annual reports in section 1008(a) \n","        of the National Defense Authorization Act for Fiscal Year 2002 \n","        (Public Law 107-107; 115 Stat. 1204; 10 U.S.C. 113 note).\n","            (5) The requirement for periodic reports in section 908(b) \n","        of the Defense Acquisition Improvement Act of 1986 (Public Law \n","        99-500; 100 Stat. 1783-140; 10 U.S.C. 2326 note) and duplicate \n","        requirements as provided for in section 6 of the Defense \n","        Technical Corrections Act of 1987 (Public Law 100-26; 101 Stat. \n","        274; 10 U.S.C. 2302 note).\n","\n","SEC. 4. 
ENHANCED REPROGRAMMING AUTHORITY FOLLOWING ACHIEVEMENT BY \n","              DEPARTMENT OF DEFENSE AND MILITARY DEPARTMENTS OF AUDIT \n","              WITH UNQUALIFIED OPINION OF STATEMENT OF BUDGETARY \n","              RESOURCES FOR FISCAL YEARS AFTER FISCAL YEAR 2015.\n","\n","    (a) Department of Defense Generally.--Subject to section 6(1), if \n","the Department of Defense obtains an audit with an unqualified opinion \n","on its statement of budgetary resources for any fiscal year after \n","fiscal year 2015, the limitation on the total amount of authorizations \n","that the Secretary of Defense may transfer pursuant to general transfer \n","authority available to the Secretary in the national interest in the \n","succeeding fiscal year shall be $8,000,000,000.\n","    (b) Military Departments, Defense Agencies, and Defense Field \n","Activities.--Subject to section 7(a), if a military department, Defense \n","Agency, or defense field activity obtains an audit with an unqualified \n","opinion on its statement of budgetary resources for any fiscal year \n","after fiscal year 2015, the thresholds for reprogramming of funds of \n","such military department, Defense Agency, or defense field activity, as \n","the case may be, without prior notice to Congress for the succeeding \n","fiscal year shall be deemed to be the thresholds as follows:\n","            (1) In the case of an increase or decrease to the program \n","        base amount for a procurement program, $60,000,000.\n","            (2) In the case of an increase or decrease to the program \n","        base amount for a research program, $30,000,000.\n","            (3) In the case of an increase or decrease to the amount \n","        for a budget activity for operation and maintenance, \n","        $45,000,000.\n","            (4) In the case of an increase or decrease to the amount \n","        for a budget activity for military personnel, $30,000,000.\n","    (c) Construction.--Nothing in this 
section shall be construed to \n","alter or revise any requirement (other than a threshold amount) for \n","notice to Congress on transfers covered by subsection (a) or \n","reprogrammings covered by subsection (b) under any other provision of \n","law.\n","    (d) Definitions.--In this section, the terms ``program base \n","amount\'\', ``procurement program\'\', ``research program\'\', and ``budget \n","activity\'\' have the meanings given such terms in chapter 6 of volume 3 \n","of the Financial Management Regulation of the Department of Defense \n","(DoD 7000.14R), dated March 2011, or any successor document.\n","\n","SEC. 5. FAILURE TO OBTAIN AUDITS WITH UNQUALIFIED OPINION OF FISCAL \n","              YEAR 2016 GENERAL FUND STATEMENT OF BUDGETARY RESOURCES \n","              OF THE DEPARTMENT OF DEFENSE.\n","\n","    (a) In General.--If the Department of Defense fails to obtain an \n","audit with an unqualified opinion on its general fund statement of \n","budgetary resources for fiscal year 2016 by December 31, 2016, the \n","following shall take effect on January 1, 2017:\n","            (1) Additional qualifications and duties of usd \n","        (comptroller).--\n","                    (A) Qualifications.--Any individual nominated for \n","                appointment to the position of Under Secretary of \n","                Defense (Comptroller) under section 135 of title 10, \n","                United States Code, shall be an individual who has \n","                served--\n","                            (i) as the chief financial officer or \n","                        equivalent position of a Federal or State \n","                        agency that has received an audit with an \n","                        unqualified opinion on such agency\'s financial \n","                        statements during the time of such individual\'s \n","                        service; or\n","                            (ii) as the chief financial officer or \n","     
                   equivalent position of a public company that \n","                        has received an audit with an unqualified \n","                        opinion on such company\'s financial statements \n","                        during the time of such individual\'s service.\n","                    (B) Duties and powers.--The duties and powers of \n","                the individual serving as Under Secretary of Defense \n","                (Comptroller) shall include, in addition to the duties \n","                and powers specified in section 135(c) of title 10, \n","                United States Code, such duties and powers with respect \n","                to the financial management of the Department of \n","                Defense as the Deputy Secretary of Defense (acting in \n","                the capacity of Chief Management Officer of the \n","                Department of Defense) or a successor official in the \n","                Department of Defense (acting in such capacity) may \n","                prescribe.\n","            (2) Additional qualifications and responsibilities of asa \n","        for financial management.--\n","                    (A) Qualifications.--Any individual nominated for \n","                appointment to the position of Assistant Secretary of \n","                the Army for Financial Management under section 3016 of \n","                title 10, United States Code, shall be an individual \n","                who has served--\n","                            (i) as the chief financial officer or \n","                        equivalent position of a Federal or State \n","                        agency that has received an audit with an \n","                        unqualified opinion on such agency\'s financial \n","                        statements during the time of such individual\'s \n","                        service; or\n","                            (ii) as the chief financial officer or \n","        
                equivalent position of a public company that \n","                        has received an audit with an unqualified \n","                        opinion on such company\'s financial statements \n","                        during the time of such individual\'s service.\n","                    (B) Responsibilities.--The responsibilities of the \n","                individual serving as Assistant Secretary of the Army \n","                for Financial Management shall include, in addition to \n","                the responsibilities specified in section 3016(b)(4) of \n","                title 10, United States Code, such responsibilities as \n","                the Deputy Secretary of Defense (acting in the capacity \n","                of Chief Management Officer of the Department of \n","                Defense) or a successor official in the Department of \n","                Defense (acting in such capacity) may prescribe.\n","            (3) Additional qualifications and responsibilities of asn \n","        for financial management.--\n","                    (A) Qualifications.--Any individual nominated for \n","                appointment to the position of Assistant Secretary of \n","                the Navy for Financial Management under section 5016 of \n","                title 10, United States Code, shall be an individual \n","                who has served--\n","                            (i) as the chief financial officer or \n","                        equivalent position of a Federal or State \n","                        agency that has received an audit with an \n","                        unqualified opinion on such agency\'s financial \n","                        statements during the time of such individual\'s \n","                        service; or\n","                            (ii) as the chief financial officer or \n","                        equivalent position of a public company that \n","                        has 
received an audit with an unqualified \n","                        opinion on such company\'s financial statements \n","                        during the time of such individual\'s service.\n","                    (B) Responsibilities.--The responsibilities of the \n","                individual serving as Assistant Secretary of the Navy \n","                for Financial Management shall include, in addition to \n","                the responsibilities specified in section 5016(b)(4) of \n","                title 10, United States Code, such responsibilities as \n","                the Deputy Secretary of Defense (acting in the capacity \n","                of Chief Management Officer of the Department of \n","                Defense) or a successor official in the Department of \n","                Defense (acting in such capacity) may prescribe.\n","            (4) Additional qualifications and responsibilities of asaf \n","        for financial management.--\n","                    (A) Qualifications.--Any individual nominated for \n","                appointment to the position of Assistant Secretary of \n","                the Air Force for Financial Management under section \n","                8016 of title 10, United States Code, shall be an \n","                individual who has served--\n","                            (i) as the chief financial officer or \n","                        equivalent position of a Federal or State \n","                        agency that has received an audit with an \n","                        unqualified opinion on such agency\'s financial \n","                        statements during the time of such individual\'s \n","                        service; or\n","                            (ii) as the chief financial officer or \n","                        equivalent position of a public company that \n","                        has received an audit with an unqualified \n","                        opinion on such 
company\'s financial statements \n","                        during the time of such individual\'s service.\n","                    (B) Responsibilities.--The responsibilities of the \n","                individual serving as Assistant Secretary of the Air \n","                Force for Financial Management shall include, in \n","                addition to the responsibilities specified in section \n","                8016(b)(4) of title 10, United States Code, such \n","                responsibilities as the Deputy Secretary of Defense \n","                (acting in the capacity of Chief Management Officer of \n","                the Department of Defense) or a successor official in \n","                the Department of Defense (acting in such capacity) may \n","                prescribe.\n","    (b) Public Company Defined.--In this section, the term ``public \n","company\'\' has the meaning given the term ``issuer\'\' in section 2(a)(7) \n","of the Sarbanes-Oxley Act of 2002 (15 U.S.C. 7201(a)(7)).\n","\n","SEC. 6. 
FAILURE OF THE DEPARTMENT OF DEFENSE TO OBTAIN AUDITS WITH \n","              UNQUALIFIED OPINION OF FISCAL YEAR 2018 FINANCIAL \n","              STATEMENTS.\n","\n","    If the Department of Defense fails to obtain an audit with an \n","unqualified opinion on its general fund statement of budgetary \n","resources for fiscal year 2018 by December 31, 2018:\n","            (1) Permanent cessation of enhanced general transfer \n","        authority.--Effective as of January 1, 2019, the authority in \n","        section 4(a) shall cease to be available to the Department of \n","        Defense for fiscal year 2018 and any fiscal year thereafter.\n","            (2) Reorganization of responsibilities of chief management \n","        officer.--Effective as of April 1, 2019:\n","                    (A) Position of chief management officer.--Section \n","                132a of title 10, United States Code, is amended to \n","                read as follows:\n","``Sec. 132a. Chief Management Officer\n","    ``(a) In General.--(1) There is a Chief Management Officer of the \n","Department of Defense, appointed from civilian life by the President, \n","by and with the advice and consent of the Senate.\n","    ``(2) Any individual nominated for appointment as Chief Management \n","Officer shall be an individual who has--\n","            ``(A) extensive executive level leadership and management \n","        experience in the public or private sector;\n","            ``(B) strong leadership skills;\n","            ``(C) a demonstrated ability to manage large and complex \n","        organizations; and\n","            ``(D) a proven record in achieving positive operational \n","        results.\n","    ``(b) Powers and Duties.--The Chief Management Officer shall \n","perform such duties and exercise such powers as the Secretary of \n","Defense may prescribe.\n","    ``(c) Service as Chief Management Officer.--(1) The Chief \n","Management Officer is the Chief Management 
Officer of the Department of \n","Defense.\n","    ``(2) In serving as the Chief Management Officer of the Department \n","of Defense, the Chief Management Officer shall be responsible for the \n","management and administration of the Department of Defense with respect \n","to the following:\n","            ``(A) The expenditure of funds, accounting, and finance.\n","            ``(B) Procurement, including procurement of any enterprise \n","        resource planning (ERP) system and any information technology \n","        (IT) system that is a financial feeder system, human resources \n","        system, or logistics system.\n","            ``(C) Facilities, property, nonmilitary equipment, and \n","        other resources.\n","            ``(D) Strategic planning, annual performance planning, and \n","        identification and tracking of performance measures.\n","            ``(E) Internal audits and management analyses of the \n","        programs and activities of the Department, including the \n","        Defense Contract Audit Agency.\n","            ``(F) Such other areas or matters as the Secretary of \n","        Defense may designate.\n","    ``(3) The head of the Defense Contract Audit Agency shall be under \n","the supervision of, and shall report directly to, the Chief Management \n","Officer.\n","    ``(d) Precedence.--The Chief Management Officer takes precedence in \n","the Department of Defense after the Secretary of Defense and the Deputy \n","Secretary of Defense.\'\'.\n","                    (B) Conforming amendments.--\n","                            (i) Section 131(b) of title 10, United \n","                        States Code, is amended--\n","                                    (I) by striking paragraph (3);\n","                                    (II) by redesignating paragraph (2) \n","                                as paragraph (3); and\n","                                    (III) by inserting after paragraph \n","                    
            (1) the following new paragraph (2):\n","            ``(2) The Chief Management Officer of the Department of \n","        Defense.\'\'.\n","                            (ii) Section 132 of such title is amended--\n","                                    (I) by striking subsection (c); and\n","                                    (II) by redesignating subsections \n","                                (d) and (e) as subsections (c) and (d), \n","                                respectively.\n","                            (iii) Section 133(e)(1) of such title is \n","                        amended by striking ``and the Deputy Secretary \n","                        of Defense\'\' and inserting ``, the Deputy \n","                        Secretary of Defense, and the Chief Management \n","                        Officer of the Department of Defense\'\'.\n","                            (iv) Such title is further amended by \n","                        inserting ``the Chief Management Officer of the \n","                        Department of Defense,\'\' after ``the Deputy \n","                        Secretary of Defense,\'\' each place it appears \n","                        in the provisions as follows:\n","                                    (I) Section 133(e)(2).\n","                                    (II) Section 134(c).\n","                            (v) Section 137a(d) of such title is \n","                        amended by striking ``the Secretaries of the \n","                        military departments,\'\' and all that follows \n","                        and inserting ``the Chief Management Officer of \n","                        the Department of Defense, the Secretaries of \n","                        the military departments, and the Under \n","                        Secretaries of Defense.\'\'.\n","                            (vi) Section 138(d) of such title is \n","                        amended by striking ``the Secretaries of the \n"," 
                       military departments,\'\' and all that follows \n","                        through the period and inserting ``the Chief \n","                        Management Officer of the Department of \n","                        Defense, the Secretaries of the military \n","                        departments, the Under Secretaries of Defense, \n","                        and the Director of Defense Research and \n","                        Engineering.\'\'.\n","                    (C) Clerical amendment.--The table of sections at \n","                the beginning of chapter 4 of such title is amended by \n","                striking the item relating to section 132a and \n","                inserting the following new item:\n","\n","``132a. Chief Management Officer.\'\'.\n","                    (D) Executive schedule.--Section 5313 of title 5, \n","                United States Code, is amended by adding at the end the \n","                following:\n","            ``Chief Management Officer of the Department of Defense.\'\'.\n","                    (E) Reference in law.--Any reference in any \n","                provision of law to the Chief Management Officer of the \n","                Department of Defense shall be deemed to refer to the \n","                Chief Management Officer of the Department of Defense \n","                under section 132a of title 10, United States Code (as \n","                amended by this paragraph).\n","            (3) Jurisdiction of dfas.--Effective as of April 1, 2019:\n","                    (A) Transfer to department of the treasury.--\n","                Jurisdiction of the Defense Finance and Accounting \n","                Service (DFAS) is transferred from the Department of \n","                Defense to the Department of the Treasury.\n","                    (B) Administration.--The Secretary of the Treasury \n","                shall administer the Defense Finance and Accounting \n","                
Service following transfer under this paragraph through \n","                the Financial Management Service of the Department of \n","                the Treasury.\n","                    (C) Memorandum of understanding.--The Secretary of \n","                Defense and the Secretary of the Treasury shall jointly \n","                enter into a memorandum of understanding regarding the \n","                transfer of jurisdiction of the Defense Finance and \n","                Accounting Service under this paragraph. The memorandum \n","                of understanding shall provide for the transfer of the \n","                personnel and other resources of the Service to the \n","                Department of the Treasury and for the assumption of \n","                responsibility for such personnel and resources by the \n","                Department of the Treasury.\n","                    (D) Construction.--Nothing in this paragraph shall \n","                be construed as terminating, altering, or revising any \n","                responsibilities or authorities of the Defense Finance \n","                and Accounting Service (other than responsibilities and \n","                authorities in connection with the exercise of \n","                jurisdiction of the Service following transfer under \n","                this paragraph).\n","\n","SEC. 7. 
FAILURE OF THE MILITARY DEPARTMENTS TO OBTAIN AUDITS WITH \n","              UNQUALIFIED OPINION OF FINANCIAL STATEMENTS FOR FISCAL \n","              YEARS AFTER FISCAL YEAR 2017.\n","\n","    (a) Permanent Cessation of Authorities on Reprogramming of Funds.--\n","If a military department fails to obtain an audit with an unqualified \n","opinion on its financial statements for fiscal year 2018 by December \n","31, 2018, effective as of January 1, 2019, the authorities in section \n","4(b) shall cease to be available to the military department for fiscal \n","year 2018 and any fiscal year thereafter.\n","    (b) Annual Prohibition on Expenditure of Funds for Certain MDAPs \n","Past Milestone B in Connection With Failure.--\n","            (1) Prohibition.--Effective for fiscal years after fiscal \n","        year 2017, if a military department fails to obtain an audit \n","        with an unqualified opinion on its financial statements for any \n","        fiscal year, effective as of the date of the issuance of the \n","        opinion on such audit, amounts available to the military \n","        department for the following fiscal year may not be obligated \n","        by the military department for a weapon or weapon system or \n","        platform being acquired as a major defense acquisition program \n","        for any activity beyond Milestone B approval unless such \n","        program has already achieved Milestone B approval of the date \n","        of the issuance of the opinion on such audit.\n","            (2) Definitions.--In this subsection:\n","                    (A) The term ``major defense acquisition program\'\' \n","                has the meaning given that term in section 2430 of \n","                title 10, United States Code.\n","                    (B) The term ``Milestone B approval\'\' has the \n","                meaning given that term in section 2366(e)(7) of title \n","                10, United States Code.\n","\n","SEC. 8. 
ENTERPRISE RESOURCE PLANNING.\n","\n","    The Secretary of Defense shall amend the acquisition guidance of \n","the Department of Defense to provide for the following:\n","            (1) The Defense Business System Management Committee may \n","        not approve procurement of any Enterprise Resource Planning \n","        (ERP) business system that is independently estimated to take \n","        longer than three years to procure from initial obligation of \n","        funds to full deployment and sustainment.\n","            (2) Any contract for the acquisition of an Enterprise \n","        Resource Planning business system shall include a provision \n","        authorizing termination of the contract at no cost to the \n","        Government if procurement of the system takes longer than three \n","        years from initial obligation of funds to full deployment and \n","        sustainment.\n","            (3) Any implementation of an Enterprise Resource Planning \n","        system shall comply with each of the following:\n","                    (A) The current Business Enterprise Architecture \n","                established by the Chief Management Officer of the \n","                Department of Defense.\n","                    (B) The provisions of section 2222 of title 10, \n","                United States Code.\n","            (4) The Deputy Secretary of Defense (acting in the capacity \n","        of Chief Management Officer of the Department of Defense) or a \n","        successor official in the Department of Defense (acting in such \n","        capacity) shall have the authority to replace any program \n","        manager (whether in a military department or a Defense Agency) \n","        for the procurement of an Enterprise Resource Planning business \n","        system if procurement of the system takes longer than three \n","        years from initial obligation of funds to full deployment and \n","        sustainment.\n","            (5) Any 
integrator contract for the implementation of an \n","        Enterprise Resource Planning business system shall only be \n","        awarded to companies that have a history of successful \n","        implementation of other Enterprise Resource Planning business \n","        systems for the Federal Government (whether with the Department \n","        of Defense or another department or agency of the Federal \n","        Government), including meeting cost and schedule goals.\n","                                 <all>\n"}'

In [138]:
# Show the full record for the bill with index label 9574.
# .loc is the explicit label-based indexer; the ambiguous .ix indexer is
# deprecated and is not available in current pandas releases.
us_bills.loc[9574]


Out[138]:
bill_num                                                                           s327-114
bill_name                                 A bill to provide for auditable financial stat...
bill_text                                 {"\n","[Congressional Bills 114th Congress]\n"...
top_subject                                              Armed forces and national security
international_affairs                                                                     0
marine_and_inland_water_transportation                                                    0
Name: 9574, dtype: object

In [133]:
# First 10 false negatives: test documents whose actual label is 1 but which
# the classifier predicted as 0 (with 0/1 labels, y_test > y_pred_class is
# true only in that case).
X_test.loc[y_test > y_pred_class].head(10)


Out[133]:
5850    {"\n","[Congressional Bills 114th Congress]\n"...
2954    {"\n","[Congressional Bills 114th Congress]\n"...
8211    {"\n","[Congressional Bills 114th Congress]\n"...
8110    {"\n","[Congressional Bills 114th Congress]\n"...
3027    {"\n","[Congressional Bills 114th Congress]\n"...
7920    {"\n","[Congressional Bills 114th Congress]\n"...
7326    {"\n","[Congressional Bills 114th Congress]\n"...
9797    {"\n","[Congressional Bills 114th Congress]\n"...
443     {"\n","[Congressional Bills 114th Congress]\n"...
6484    {"\n","[Congressional Bills 114th Congress]\n"...
Name: bill_text, dtype: object

In [136]:
# Full text of the test document with index label 2954, one of the false
# negatives listed above.
X_test.loc[2954]


Out[136]:
'{"\n","[Congressional Bills 114th Congress]\n","[From the U.S. Government Printing Office]\n","[H.R. 337 Introduced in House (IH)]\n","\n","114th CONGRESS\n","  1st Session\n","                                H. R. 337\n","\n"," To provide limitations on maritime liens on fishing permits, and for \n","                            other purposes.\n","\n","\n","_______________________________________________________________________\n","\n","\n","                    IN THE HOUSE OF REPRESENTATIVES\n","\n","                            January 13, 2015\n","\n"," Mr. Young of Alaska introduced the following bill; which was referred \n","         to the Committee on Transportation and Infrastructure\n","\n","_______________________________________________________________________\n","\n","                                 A BILL\n","\n","\n"," \n"," To provide limitations on maritime liens on fishing permits, and for \n","                            other purposes.\n","\n","    Be it enacted by the Senate and House of Representatives of the \n","United States of America in Congress assembled,\n","\n","SECTION 1. SHORT TITLE.\n","\n","    This Act may be cited as the ``Maritime Lien Reform Act\'\'.\n","\n","SEC. 2. LIMITATION ON MARITIME LIENS ON FISHING PERMIT AND PERMIT \n","              DESCRIPTION.\n","\n","    (a) In General.--Subchapter I of chapter 313 of title 46, United \n","States Code, is amended by adding at the end the following:\n","``Sec. 31310. 
Limitation on maritime liens on fishing permit and permit \n","              description\n","    ``(a) In General.--This chapter--\n","            ``(1) does not establish a maritime lien on a permit that--\n","                    ``(A) authorizes a person or use of a vessel to \n","                engage in fishing; and\n","                    ``(B) is issued under State or Federal law; and\n","            ``(2) does not authorize any civil action to enforce a \n","        maritime lien on such a permit.\n","    ``(b) Fishing Permit Described.--A fishing permit--\n","            ``(1) is governed solely by the State or Federal law under \n","        which it was issued; and\n","            ``(2) is not included in the whole of a vessel or as an \n","        appurtenance or intangible of a vessel for any purpose.\n","    ``(c) Limitation on Statutory Construction.--Nothing in subsections \n","(a) and (b) shall be construed as imposing any limitation upon the \n","authority of the Secretary of Commerce to modify, suspend, revoke, or \n","sanction any Federal fishery permit issued by the Secretary of Commerce \n","or to bring a civil action to enforce such modification, suspension, \n","revocation, or sanction.\'\'.\n","    (b) Clerical Amendment.--The table of sections at the beginning of \n","such chapter is amended by inserting after the item relating to section \n","31309 the following:\n","\n","``31310. Limitation on maritime liens on fishing permit and permit \n","                            description.\'\'.\n","                                 <all>\n"}'

In [137]:
# Show the full record (bill number, name, subject and label columns) for the
# bill with index label 2954.
# .loc is the explicit label-based indexer; the ambiguous .ix indexer is
# deprecated and is not available in current pandas releases.
us_bills.loc[2954]


Out[137]:
bill_num                                                                          hr337-114
bill_name                                 To provide limitations on maritime liens on fi...
bill_text                                 {"\n","[Congressional Bills 114th Congress]\n"...
top_subject                                                 Transportation and public works
international_affairs                                                                     0
marine_and_inland_water_transportation                                                    1
Name: 2954, dtype: object

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [144]:
# Build a bag-of-words vectorizer that keeps the original casing
# (lowercase=False), so differently-cased spellings of a word become separate
# features and the vocabulary grows. Fit it on the training text and show the
# shape of the resulting document-term matrix as (documents, features).
vect = CountVectorizer(lowercase=False)
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm.shape


Out[144]:
(8199, 72284)

In [145]:
# Count both single words (1-grams) and adjacent word pairs (2-grams), keep the
# original casing, and drop scikit-learn's built-in English stop words. Adding
# 2-grams makes the vocabulary far larger; the shape below is
# (documents, features).
# NOTE(review): the built-in English stop list is lowercase, so with
# lowercase=False capitalized forms such as "The" are presumably not
# filtered out -- confirm this is intended.
vect = CountVectorizer(lowercase=False, ngram_range=(1, 2),stop_words='english')
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm.shape


Out[145]:
(8199, 1436064)

In [146]:
# Create the document-term matrices: learn the vocabulary from the training
# text only, then encode the test text with that same vocabulary so the two
# matrices share their feature columns.
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)

# Fit a multinomial Naive Bayes classifier on the training matrix and predict
# the class label of every test document.
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

# Report the fraction of test documents classified correctly. The call form
# print(...) with a single argument prints the same text on Python 2 and
# Python 3, whereas the bare print statement is a syntax error on Python 3.
# NOTE(review): accuracy can look high simply because positive cases are rare
# (see the confusion matrix cell below); compare it with the majority-class
# baseline before drawing conclusions.
print(metrics.accuracy_score(y_test, y_pred_class))


0.984637893197

In [147]:
# Predict class probabilities for the test documents and keep column index 1
# of the result (presumably the probability of the positive class, label 1 --
# verify against nb.classes_).
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]

In [148]:
# Confusion matrix for the test set: rows are the actual classes and columns
# are the predicted classes, both in sorted label order.
# print(...) with a single argument prints the same text on Python 2 and
# Python 3, whereas the bare print statement is a syntax error on Python 3.
print(metrics.confusion_matrix(y_test, y_pred_class))


[[2680   19]
 [  23   12]]

In [ ]:


In [ ]: