Imports


In [1]:
import sys
import os

sys.path.append(os.path.join(os.getcwd(), os.path.pardir))
import settings

In [74]:
# Import the project modules before reloading them: reload() needs the names
# to be bound already, so on a fresh kernel the reload calls below would raise
# NameError if these imports were left commented out.
from models import pos_tagger
from models import tokenizer

# Reload so that edits made to the module source files while this kernel is
# running are picked up. The value of the last call is shown as the cell output.
reload(pos_tagger)
reload(tokenizer)


Out[74]:
<module 'models.tokenizer' from '/home/blannon/dev/fcc-net-neutrality-comments/notebooks/../models/tokenizer.py'>

In [4]:
import re

Tagging


In [5]:
# Sample sentence containing literal '|' characters, the same character that
# separates the word, tag and lemma fields in the tagged output.
test_string = "I am the greatest. I will now leave for no raisin. Who the he|| puts a pipe in a sentence? (I mean | really, it's ridiculous|behavior.)"

In [6]:
# Tag the sample sentence and display the result as the cell output.
# NOTE(review): `tagged` is rebound to a hard-coded string in a later cell, so
# this value does not feed the cells that follow.
tagged = pos_tagger.tag_content(test_string)
tagged


Out[6]:
u"I|PP|i am|VBP|be the|DT|the greatest|JJS|great .|SENT|. I|PP|i will|MD|will now|RB|now leave|VV|leave for|IN|for no|DT|no raisin|NN|raisin .|SENT|. Who|WP|who the|DT|the he|||NN|he|| puts|VVZ|put a|DT|a pipe|NN|pipe in|IN|in a|DT|a sentence|NN|sentence ?|SENT|? (|(|( I|PP|i mean|VVP|mean ||VVG|<unknown> really|RB|really ,|,|, it|PP|it 's|VVZ|'s ridiculous|behavior.|JJ|ridiculous|behavior. )|)|)"

In [7]:
tagged = "7521321604.txt|JJ|7521321604.txt You|PP|you know|VVP|know FCC|NP|fcc you|PP|you 're|VVP|'re not|RB|not doing|VVG|do the|DT|the friggin|NN|friggin math|NN|math .|SENT|. AT&T|NP|at&t ,|,|, Verizon|NP|verizon and|CC|and Comcast|NP|comcast are|VBP|be all|DT|all content|NN|content providers|NNS|provider .|SENT|. If|IN|if you|PP|you let|VVP|let them|PP|them tack|VVP|tack on|IN|on surcharges|NNS|surcharge for|IN|for broadband|NN|broadband speeds|NNS|speed to|TO|to companies|NNS|company like|IN|like Netflix|NP|netflix ,|,|, then|RB|then Netflix|NP|netflix is|VBZ|be going|VVG|go to|TO|to have|VH|have to|TO|to raise|VV|raise prices|NNS|price .|SENT|. I|PP|i would|MD|would DEFINITELY|NP|definitely CANCEL|NP|cancel MY|NP|my NETFLIX|NP|netflix ACCOUNT|NN|account !|SENT|! And|CC|and then|RB|then the|DT|the oligopoly|NN|oligopoly for|IN|for AT&T|NP|at&t ,|,|, Verizon|NP|verizon and|CC|and Comcast|NP|comcast will|MD|will get|VV|get more|RBR|more profitable|JJ|profitable and|CC|and dominant|JJ|dominant .|SENT|. Not|RB|not to|TO|to mention|VV|mention Comcast|NP|comcast is|VBZ|be already|RB|already trying|VVG|try to|TO|to merge|VV|merge with|IN|with Time|NP|time Warner|NP|warner .|SENT|. IF|IN|if YOU|PP|you RUIN|NP|ruin NET|JJ|net NEUTRALITY|NP|neutrality YOU|PP|you ARE|VBP|be GOING|VVG|go TO|TO|to FUCK|NP|fuck UP|IN|up MY|NP|my ENTERTAINMENT|NP|entertainment CHOICES|NNS|choice AND|CC|and I|PP|i ALSO|NP|also WORK|VVP|work ON|IN|on THE|DT|the INTERNET|NP|internet FOR|IN|for A|DT|a LIVING|NN|live .|SENT|. YOU|PP|you COULD|NP|could FUCK|NP|fuck UP|IN|up MY|NP|my ABILITY|NP|ability TO|TO|to WORK|VVP|work AND|CC|and BE|VB|be PROFITABLE|NP|profitable .|SENT|. PULL|NP|pull YOUR|JJ|your FUCKING|NP|fucking HEAD|NP|head OUT|IN|out OF|IN|of YOUR|JJ|your FUCKING|NP|fucking SELFISH|NP|selfish CRIMINALLY|NP|criminally MOTIVATED|VVD|motivate FAT|NP|fat ASS|NP|ass YOU|PP|you FUCKS!!|NP|fucks!! !|SENT|! Page|NP|page 1|CD|1"

In [8]:
tagged.split()


Out[8]:
['7521321604.txt|JJ|7521321604.txt',
 'You|PP|you',
 'know|VVP|know',
 'FCC|NP|fcc',
 'you|PP|you',
 "'re|VVP|'re",
 'not|RB|not',
 'doing|VVG|do',
 'the|DT|the',
 'friggin|NN|friggin',
 'math|NN|math',
 '.|SENT|.',
 'AT&T|NP|at&t',
 ',|,|,',
 'Verizon|NP|verizon',
 'and|CC|and',
 'Comcast|NP|comcast',
 'are|VBP|be',
 'all|DT|all',
 'content|NN|content',
 'providers|NNS|provider',
 '.|SENT|.',
 'If|IN|if',
 'you|PP|you',
 'let|VVP|let',
 'them|PP|them',
 'tack|VVP|tack',
 'on|IN|on',
 'surcharges|NNS|surcharge',
 'for|IN|for',
 'broadband|NN|broadband',
 'speeds|NNS|speed',
 'to|TO|to',
 'companies|NNS|company',
 'like|IN|like',
 'Netflix|NP|netflix',
 ',|,|,',
 'then|RB|then',
 'Netflix|NP|netflix',
 'is|VBZ|be',
 'going|VVG|go',
 'to|TO|to',
 'have|VH|have',
 'to|TO|to',
 'raise|VV|raise',
 'prices|NNS|price',
 '.|SENT|.',
 'I|PP|i',
 'would|MD|would',
 'DEFINITELY|NP|definitely',
 'CANCEL|NP|cancel',
 'MY|NP|my',
 'NETFLIX|NP|netflix',
 'ACCOUNT|NN|account',
 '!|SENT|!',
 'And|CC|and',
 'then|RB|then',
 'the|DT|the',
 'oligopoly|NN|oligopoly',
 'for|IN|for',
 'AT&T|NP|at&t',
 ',|,|,',
 'Verizon|NP|verizon',
 'and|CC|and',
 'Comcast|NP|comcast',
 'will|MD|will',
 'get|VV|get',
 'more|RBR|more',
 'profitable|JJ|profitable',
 'and|CC|and',
 'dominant|JJ|dominant',
 '.|SENT|.',
 'Not|RB|not',
 'to|TO|to',
 'mention|VV|mention',
 'Comcast|NP|comcast',
 'is|VBZ|be',
 'already|RB|already',
 'trying|VVG|try',
 'to|TO|to',
 'merge|VV|merge',
 'with|IN|with',
 'Time|NP|time',
 'Warner|NP|warner',
 '.|SENT|.',
 'IF|IN|if',
 'YOU|PP|you',
 'RUIN|NP|ruin',
 'NET|JJ|net',
 'NEUTRALITY|NP|neutrality',
 'YOU|PP|you',
 'ARE|VBP|be',
 'GOING|VVG|go',
 'TO|TO|to',
 'FUCK|NP|fuck',
 'UP|IN|up',
 'MY|NP|my',
 'ENTERTAINMENT|NP|entertainment',
 'CHOICES|NNS|choice',
 'AND|CC|and',
 'I|PP|i',
 'ALSO|NP|also',
 'WORK|VVP|work',
 'ON|IN|on',
 'THE|DT|the',
 'INTERNET|NP|internet',
 'FOR|IN|for',
 'A|DT|a',
 'LIVING|NN|live',
 '.|SENT|.',
 'YOU|PP|you',
 'COULD|NP|could',
 'FUCK|NP|fuck',
 'UP|IN|up',
 'MY|NP|my',
 'ABILITY|NP|ability',
 'TO|TO|to',
 'WORK|VVP|work',
 'AND|CC|and',
 'BE|VB|be',
 'PROFITABLE|NP|profitable',
 '.|SENT|.',
 'PULL|NP|pull',
 'YOUR|JJ|your',
 'FUCKING|NP|fucking',
 'HEAD|NP|head',
 'OUT|IN|out',
 'OF|IN|of',
 'YOUR|JJ|your',
 'FUCKING|NP|fucking',
 'SELFISH|NP|selfish',
 'CRIMINALLY|NP|criminally',
 'MOTIVATED|VVD|motivate',
 'FAT|NP|fat',
 'ASS|NP|ass',
 'YOU|PP|you',
 'FUCKS!!|NP|fucks!!',
 '!|SENT|!',
 'Page|NP|page',
 '1|CD|1']

Regex for splitting `word|TAG|lemma` tokens


In [38]:
# Pattern for one tagged token of the form word|TAG|lemma:
#   group 1 - the word, matched non-greedily so it stops at the first '|' that
#             is followed by something that looks like a tag;
#   group 2 - the tag: a run of upper-case letters and the punctuation
#             characters . , : ) ( $ " # ` '
#   group 3 - the lemma: everything after the next '|'.
# An earlier version of the tag character class lacked the # ` ' characters.
regex = re.compile(r'(.+?)\|([A-Z\.\,:\)\(\$)\"#`\']+)\|(.+)')

In [39]:
# Print every whitespace-separated token of `tagged` as lemma_TAG, one per line.
for s in tagged.split():
    # search() picks the same leftmost match that findall(s)[0] selected, but
    # returns None instead of raising IndexError when a token does not have
    # the word|TAG|lemma shape.
    match = regex.search(s)
    if match is None:
        continue
    # Only the tag (group 2) and the lemma (group 3) are needed; the surface
    # word (group 1) is not printed.
    pos, lemma = match.group(2, 3)
    # A single-argument print(...) produces the same output whether print is a
    # statement (Python 2) or a function (Python 3).
    print('{s}_{pos}'.format(s=lemma, pos=pos))


7521321604.txt_JJ
you_PP
know_VVP
fcc_NP
you_PP
're_VVP
not_RB
do_VVG
the_DT
friggin_NN
math_NN
._SENT
at&t_NP
,_,
verizon_NP
and_CC
comcast_NP
be_VBP
all_DT
content_NN
provider_NNS
._SENT
if_IN
you_PP
let_VVP
them_PP
tack_VVP
on_IN
surcharge_NNS
for_IN
broadband_NN
speed_NNS
to_TO
company_NNS
like_IN
netflix_NP
,_,
then_RB
netflix_NP
be_VBZ
go_VVG
to_TO
have_VH
to_TO
raise_VV
price_NNS
._SENT
i_PP
would_MD
definitely_NP
cancel_NP
my_NP
netflix_NP
account_NN
!_SENT
and_CC
then_RB
the_DT
oligopoly_NN
for_IN
at&t_NP
,_,
verizon_NP
and_CC
comcast_NP
will_MD
get_VV
more_RBR
profitable_JJ
and_CC
dominant_JJ
._SENT
not_RB
to_TO
mention_VV
comcast_NP
be_VBZ
already_RB
try_VVG
to_TO
merge_VV
with_IN
time_NP
warner_NP
._SENT
if_IN
you_PP
ruin_NP
net_JJ
neutrality_NP
you_PP
be_VBP
go_VVG
to_TO
fuck_NP
up_IN
my_NP
entertainment_NP
choice_NNS
and_CC
i_PP
also_NP
work_VVP
on_IN
the_DT
internet_NP
for_IN
a_DT
live_NN
._SENT
you_PP
could_NP
fuck_NP
up_IN
my_NP
ability_NP
to_TO
work_VVP
and_CC
be_VB
profitable_NP
._SENT
pull_NP
your_JJ
fucking_NP
head_NP
out_IN
of_IN
your_JJ
fucking_NP
selfish_NP
criminally_NP
motivate_VVD
fat_NP
ass_NP
you_PP
fucks!!_NP
!_SENT
page_NP
1_CD

In [11]:
# Tags handed to the tokenizer as `filter_tags`: comma, full stop, colon and
# closing parenthesis.
punct_tags = [',', '.', ':', ')']

Test the tokenizer


In [12]:
# Reload both project modules so that edits made to their source files take
# effect in this kernel before the tokenizer is constructed.
reload(pos_tagger)
reload(tokenizer)


Out[12]:
<module 'models.tokenizer' from '/home/blannon/dev/fcc-net-neutrality-comments/notebooks/../models/tokenizer.pyc'>

In [75]:
# Tokenizer for pre-tagged text, built with no stopword list and with
# `punct_tags` as the tags to filter.
# NOTE(review): presumably tokens carrying one of those tags are dropped;
# confirm against models/tokenizer.py.
pt_tokenizer = tokenizer.PretaggedTokenizer(stopword_list=None, filter_tags=punct_tags)

In [14]:
pt_tokenizer.tokenize(tagged)


Out[14]:
['know',
 'fcc',
 'friggin',
 'math',
 'verizon',
 'comcast',
 'content',
 'provider',
 'let',
 'tack',
 'surcharge',
 'broadband',
 'speed',
 'company',
 'like',
 'netflix',
 'netflix',
 'go',
 'raise',
 'price',
 'would',
 'definitely',
 'cancel',
 'netflix',
 'account',
 'oligopoly',
 'verizon',
 'comcast',
 'get',
 'profitable',
 'dominant',
 'mention',
 'comcast',
 'already',
 'try',
 'merge',
 'time',
 'warner',
 'ruin',
 'net',
 'neutrality',
 'go',
 'fuck',
 'entertainment',
 'choice',
 'also',
 'work',
 'internet',
 'live',
 'could',
 'fuck',
 'ability',
 'work',
 'profitable',
 'pull',
 'fucking',
 'head',
 'fucking',
 'selfish',
 'criminally',
 'motivate',
 'fat',
 'ass',
 'page']

In [15]:
import json

In [24]:
# Processed comment used for debugging the tokenizer.
debug_target = '../data/json/processed/6018149583-1.json'

# The file handle is only needed while the JSON is parsed.
with open(debug_target) as fin:
    jd = json.load(fin)

# Tokenise the pre-tagged text stored under the 'tagged' key.
tokens = pt_tokenizer.tokenize(jd['tagged'])

In [26]:
jd['tagged']


Out[26]:
u'Chairman|NP|chairman Tom|NP|tom Wheeler|NP|wheeler Federal|NP|federal Communications|NP|communication Commission|NP|commission 445|CD|445 12th|NP|12th Street|NP|street ,|,|, SW|NP|sw Washington|NP|washington ,|,|, DC|NN|dc 20554|CD|20554 Re|NP|re :|:|: WC|NP|wc Docket|NP|docket No.|NN|no. 14-28|CD|14-28 Comments|NP|comment Dear|NP|dear Chairman|NP|chairman Wheeler|NP|wheeler and|CC|and the|DT|the Commissioners|NPS|commissioner of|IN|of the|DT|the FCC|NP|fcc ,|,|, Enclosed|VVN|enclose are|VBP|be 117|CD|117 ,|,|, 460|CD|460 individual|JJ|individual public|JJ|public comments|NNS|comment collected|VVN|collect by|IN|by CREDO|NP|credo Action|NP|action for|IN|for submission|NN|submission to|TO|to the|DT|the open|JJ|open Internet|NP|internet docket|NN|docket .|SENT|. Individual|JJ|individual comments|NNS|comment may|MD|may differ|VV|differ throughout|IN|throughout ,|,|, although|IN|although the|DT|the majority|NN|majority read|NN|read as|IN|as follows|VVZ|follow :|:|: "|"|" As|IN|as an|DT|a Internet|NP|internet user|NN|user who|WP|who believes|VVZ|believe strongly|RB|strongly in|IN|in the|DT|the importance|NN|importance of|IN|of a|DT|a free|JJ|free and|CC|and open|JJ|open Internet|NP|internet ,|,|, I|PP|i urge|VVP|urge the|DT|the FCC|NP|fcc to|TO|to reclassify|VV|reclassify broadband|NP|broadband Internet|NP|internet access|NN|access as|IN|as a|DT|a telecommunications|NNS|telecommunication service|VVP|service ,|,|, and|CC|and save|VV|save Net|NP|net Neutrality|NP|neutrality .|SENT|. 
In|IN|in addition|NN|addition ,|,|, the|DT|the FCC|NP|fcc should|MD|should reject|VV|reject the|DT|the proposed|VVN|propose rules|NNS|rule that|WDT|that would|MD|would allow|VV|allow Internet|NP|internet service|NN|service providers|NNS|provider to|TO|to divide|VV|divide the|DT|the Internet|NP|internet into|IN|into fast|JJ|fast lanes|NNS|lane for|IN|for wealthy|JJ|wealthy corporations|NNS|corporation and|CC|and slow|JJ|slow lanes|NNS|lane for|IN|for the|DT|the rest|NN|rest of|IN|of us|PP|us .|SENT|. "|"|" The|DT|the comments|NNS|comment have|VHP|have been|VBN|be split|NN|split into|IN|into 24|CD|24 documents|NNS|document in|IN|in order|NN|order to|TO|to facilitate|VV|facilitate uploading|VVG|upload to|TO|to the|DT|the FCC|NP|fcc website|NN|website .|SENT|. This|DT|this is|VBZ|be the|DT|the first|JJ|first of|IN|of the|DT|the four|CD|four submissions|NNS|submission .|SENT|. Please|UH|please contact|NN|contact me|PP|me if|IN|if you|PP|you have|VHP|have any|DT|any questions|NNS|question about|IN|about the|DT|the attached|VVN|attach comment|NN|comment submissions|NNS|submission .|SENT|. Sincerely|RB|sincerely ,|,|, Becky|NP|becky Bond|NP|bond Political|NP|political Director|NP|director ,|,|, CREDO|NP|credo Action|NP|action bbond@credoaction.com|JJ|bbond@credoaction.com 415-369-2000|CD|415-369-2000 \x0c|NN|\x0c'

In [29]:
tokens


Out[29]:
[u'chairman',
 u'tom',
 u'wheeler',
 u'federal',
 u'communication',
 u'commission',
 u'street',
 u'sw',
 u'washington',
 u'dc',
 u're',
 u'wc',
 u'docket',
 u'comment',
 u'dear',
 u'chairman',
 u'wheeler',
 u'commissioner',
 u'fcc',
 u'enclose',
 u'individual',
 u'public',
 u'comment',
 u'collect',
 u'credo',
 u'action',
 u'submission',
 u'open',
 u'internet',
 u'docket',
 u'individual',
 u'comment',
 u'may',
 u'differ',
 u'throughout',
 u'although',
 u'majority',
 u'read',
 u'follow',
 u'internet',
 u'user',
 u'believe',
 u'strongly',
 u'importance',
 u'free',
 u'open',
 u'internet',
 u'urge',
 u'fcc',
 u'reclassify',
 u'broadband',
 u'internet',
 u'access',
 u'telecommunication',
 u'service',
 u'save',
 u'net',
 u'neutrality',
 u'addition',
 u'fcc',
 u'reject',
 u'propose',
 u'rule',
 u'would',
 u'allow',
 u'internet',
 u'service',
 u'provider',
 u'divide',
 u'internet',
 u'fast',
 u'lane',
 u'wealthy',
 u'corporation',
 u'slow',
 u'lane',
 u'rest',
 u'us',
 u'comment',
 u'split',
 u'document',
 u'order',
 u'facilitate',
 u'upload',
 u'fcc',
 u'website',
 u'first',
 u'four',
 u'submission',
 u'please',
 u'contact',
 u'question',
 u'attach',
 u'comment',
 u'submission',
 u'sincerely',
 u'becky',
 u'bond',
 u'political',
 u'director',
 u'credo',
 u'action']

In [30]:
jd


Out[30]:
{u'applicant': u'Becky Bond',
 u'brief': False,
 u'city': u'San Francisco',
 u'dateRcpt': u'2014-07-16T04:00:00Z',
 u'disseminated': u'2014-07-17T13:06:23.37Z',
 u'exParte': False,
 u'id': u'6018149583-1',
 u'modified': u'2014-07-18T12:32:46.94Z',
 u'pages': 5490,
 u'preprocessed': True,
 u'proceeding': u'14-28',
 u'regFlexAnalysis': False,
 u'smallBusinessImpact': False,
 u'stateCd': u'CA',
 u'submissionType': u'COMMENT',
 u'tagged': u'Chairman|NP|chairman Tom|NP|tom Wheeler|NP|wheeler Federal|NP|federal Communications|NP|communication Commission|NP|commission 445|CD|445 12th|NP|12th Street|NP|street ,|,|, SW|NP|sw Washington|NP|washington ,|,|, DC|NN|dc 20554|CD|20554 Re|NP|re :|:|: WC|NP|wc Docket|NP|docket No.|NN|no. 14-28|CD|14-28 Comments|NP|comment Dear|NP|dear Chairman|NP|chairman Wheeler|NP|wheeler and|CC|and the|DT|the Commissioners|NPS|commissioner of|IN|of the|DT|the FCC|NP|fcc ,|,|, Enclosed|VVN|enclose are|VBP|be 117|CD|117 ,|,|, 460|CD|460 individual|JJ|individual public|JJ|public comments|NNS|comment collected|VVN|collect by|IN|by CREDO|NP|credo Action|NP|action for|IN|for submission|NN|submission to|TO|to the|DT|the open|JJ|open Internet|NP|internet docket|NN|docket .|SENT|. Individual|JJ|individual comments|NNS|comment may|MD|may differ|VV|differ throughout|IN|throughout ,|,|, although|IN|although the|DT|the majority|NN|majority read|NN|read as|IN|as follows|VVZ|follow :|:|: "|"|" As|IN|as an|DT|a Internet|NP|internet user|NN|user who|WP|who believes|VVZ|believe strongly|RB|strongly in|IN|in the|DT|the importance|NN|importance of|IN|of a|DT|a free|JJ|free and|CC|and open|JJ|open Internet|NP|internet ,|,|, I|PP|i urge|VVP|urge the|DT|the FCC|NP|fcc to|TO|to reclassify|VV|reclassify broadband|NP|broadband Internet|NP|internet access|NN|access as|IN|as a|DT|a telecommunications|NNS|telecommunication service|VVP|service ,|,|, and|CC|and save|VV|save Net|NP|net Neutrality|NP|neutrality .|SENT|. 
In|IN|in addition|NN|addition ,|,|, the|DT|the FCC|NP|fcc should|MD|should reject|VV|reject the|DT|the proposed|VVN|propose rules|NNS|rule that|WDT|that would|MD|would allow|VV|allow Internet|NP|internet service|NN|service providers|NNS|provider to|TO|to divide|VV|divide the|DT|the Internet|NP|internet into|IN|into fast|JJ|fast lanes|NNS|lane for|IN|for wealthy|JJ|wealthy corporations|NNS|corporation and|CC|and slow|JJ|slow lanes|NNS|lane for|IN|for the|DT|the rest|NN|rest of|IN|of us|PP|us .|SENT|. "|"|" The|DT|the comments|NNS|comment have|VHP|have been|VBN|be split|NN|split into|IN|into 24|CD|24 documents|NNS|document in|IN|in order|NN|order to|TO|to facilitate|VV|facilitate uploading|VVG|upload to|TO|to the|DT|the FCC|NP|fcc website|NN|website .|SENT|. This|DT|this is|VBZ|be the|DT|the first|JJ|first of|IN|of the|DT|the four|CD|four submissions|NNS|submission .|SENT|. Please|UH|please contact|NN|contact me|PP|me if|IN|if you|PP|you have|VHP|have any|DT|any questions|NNS|question about|IN|about the|DT|the attached|VVN|attach comment|NN|comment submissions|NNS|submission .|SENT|. Sincerely|RB|sincerely ,|,|, Becky|NP|becky Bond|NP|bond Political|NP|political Director|NP|director ,|,|, CREDO|NP|credo Action|NP|action bbond@credoaction.com|JJ|bbond@credoaction.com 415-369-2000|CD|415-369-2000 \x0c|NN|\x0c',
 u'text': u'Chairman Tom Wheeler\nFederal Communications Commission\n445 12th Street, SW\nWashington, DC 20554\nRe: WC Docket No. 14-28 Comments\nDear Chairman Wheeler and the Commissioners of the FCC,\nEnclosed are 117,460 individual public comments collected by CREDO Action for\nsubmission to the open Internet docket. Individual comments may differ throughout,\nalthough the majority read as follows:\n\u201cAs an Internet user who believes strongly in the importance of a free and open\nInternet, I urge the FCC to reclassify broadband Internet access as a\ntelecommunications service, and save Net Neutrality.\nIn addition, the FCC should reject the proposed rules that would allow Internet\nservice providers to divide the Internet into fast lanes for wealthy corporations and\nslow lanes for the rest of us.\u201d\nThe comments have been split into 24 documents in order to facilitate uploading to\nthe FCC website. This is the first of the four submissions.\nPlease contact me if you have any questions about the attached comment\nsubmissions.\nSincerely,\nBecky Bond\nPolitical Director, CREDO Action\nbbond@credoaction.com\n415-369-2000\n\t\n\n\x0c',
 u'viewingStatus': u'Unrestricted',
 u'zip': u'94105'}

In [31]:
u'\u2028' in jd['tagged']


Out[31]:
False

In [35]:
# Replace each form-feed character (\x0c) in the tagged text with a space.
stripped = jd['tagged'].replace(u'\x0c',' ')

In [41]:
# Run the tag regex over every chunk obtained by splitting on a single space.
# Chunks that are empty, or that otherwise lack the word|TAG|lemma shape,
# produce an empty list in the result instead of a (word, tag, lemma) tuple.
[regex.findall(s) for s in stripped.split(' ')]


Out[41]:
[[(u'Chairman', u'NP', u'chairman')],
 [(u'Tom', u'NP', u'tom')],
 [(u'Wheeler', u'NP', u'wheeler')],
 [(u'Federal', u'NP', u'federal')],
 [(u'Communications', u'NP', u'communication')],
 [(u'Commission', u'NP', u'commission')],
 [(u'445', u'CD', u'445')],
 [(u'12th', u'NP', u'12th')],
 [(u'Street', u'NP', u'street')],
 [(u',', u',', u',')],
 [(u'SW', u'NP', u'sw')],
 [(u'Washington', u'NP', u'washington')],
 [(u',', u',', u',')],
 [(u'DC', u'NN', u'dc')],
 [(u'20554', u'CD', u'20554')],
 [(u'Re', u'NP', u're')],
 [(u':', u':', u':')],
 [(u'WC', u'NP', u'wc')],
 [(u'Docket', u'NP', u'docket')],
 [(u'No.', u'NN', u'no.')],
 [(u'14-28', u'CD', u'14-28')],
 [(u'Comments', u'NP', u'comment')],
 [(u'Dear', u'NP', u'dear')],
 [(u'Chairman', u'NP', u'chairman')],
 [(u'Wheeler', u'NP', u'wheeler')],
 [(u'and', u'CC', u'and')],
 [(u'the', u'DT', u'the')],
 [(u'Commissioners', u'NPS', u'commissioner')],
 [(u'of', u'IN', u'of')],
 [(u'the', u'DT', u'the')],
 [(u'FCC', u'NP', u'fcc')],
 [(u',', u',', u',')],
 [(u'Enclosed', u'VVN', u'enclose')],
 [(u'are', u'VBP', u'be')],
 [(u'117', u'CD', u'117')],
 [(u',', u',', u',')],
 [(u'460', u'CD', u'460')],
 [(u'individual', u'JJ', u'individual')],
 [(u'public', u'JJ', u'public')],
 [(u'comments', u'NNS', u'comment')],
 [(u'collected', u'VVN', u'collect')],
 [(u'by', u'IN', u'by')],
 [(u'CREDO', u'NP', u'credo')],
 [(u'Action', u'NP', u'action')],
 [(u'for', u'IN', u'for')],
 [(u'submission', u'NN', u'submission')],
 [(u'to', u'TO', u'to')],
 [(u'the', u'DT', u'the')],
 [(u'open', u'JJ', u'open')],
 [(u'Internet', u'NP', u'internet')],
 [(u'docket', u'NN', u'docket')],
 [(u'.', u'SENT', u'.')],
 [(u'Individual', u'JJ', u'individual')],
 [(u'comments', u'NNS', u'comment')],
 [(u'may', u'MD', u'may')],
 [(u'differ', u'VV', u'differ')],
 [(u'throughout', u'IN', u'throughout')],
 [(u',', u',', u',')],
 [(u'although', u'IN', u'although')],
 [(u'the', u'DT', u'the')],
 [(u'majority', u'NN', u'majority')],
 [(u'read', u'NN', u'read')],
 [(u'as', u'IN', u'as')],
 [(u'follows', u'VVZ', u'follow')],
 [(u':', u':', u':')],
 [(u'"', u'"', u'"')],
 [(u'As', u'IN', u'as')],
 [(u'an', u'DT', u'a')],
 [(u'Internet', u'NP', u'internet')],
 [(u'user', u'NN', u'user')],
 [(u'who', u'WP', u'who')],
 [(u'believes', u'VVZ', u'believe')],
 [(u'strongly', u'RB', u'strongly')],
 [(u'in', u'IN', u'in')],
 [(u'the', u'DT', u'the')],
 [(u'importance', u'NN', u'importance')],
 [(u'of', u'IN', u'of')],
 [(u'a', u'DT', u'a')],
 [(u'free', u'JJ', u'free')],
 [(u'and', u'CC', u'and')],
 [(u'open', u'JJ', u'open')],
 [(u'Internet', u'NP', u'internet')],
 [(u',', u',', u',')],
 [(u'I', u'PP', u'i')],
 [(u'urge', u'VVP', u'urge')],
 [(u'the', u'DT', u'the')],
 [(u'FCC', u'NP', u'fcc')],
 [(u'to', u'TO', u'to')],
 [(u'reclassify', u'VV', u'reclassify')],
 [(u'broadband', u'NP', u'broadband')],
 [(u'Internet', u'NP', u'internet')],
 [(u'access', u'NN', u'access')],
 [(u'as', u'IN', u'as')],
 [(u'a', u'DT', u'a')],
 [(u'telecommunications', u'NNS', u'telecommunication')],
 [(u'service', u'VVP', u'service')],
 [(u',', u',', u',')],
 [(u'and', u'CC', u'and')],
 [(u'save', u'VV', u'save')],
 [(u'Net', u'NP', u'net')],
 [(u'Neutrality', u'NP', u'neutrality')],
 [(u'.', u'SENT', u'.')],
 [(u'In', u'IN', u'in')],
 [(u'addition', u'NN', u'addition')],
 [(u',', u',', u',')],
 [(u'the', u'DT', u'the')],
 [(u'FCC', u'NP', u'fcc')],
 [(u'should', u'MD', u'should')],
 [(u'reject', u'VV', u'reject')],
 [(u'the', u'DT', u'the')],
 [(u'proposed', u'VVN', u'propose')],
 [(u'rules', u'NNS', u'rule')],
 [(u'that', u'WDT', u'that')],
 [(u'would', u'MD', u'would')],
 [(u'allow', u'VV', u'allow')],
 [(u'Internet', u'NP', u'internet')],
 [(u'service', u'NN', u'service')],
 [(u'providers', u'NNS', u'provider')],
 [(u'to', u'TO', u'to')],
 [(u'divide', u'VV', u'divide')],
 [(u'the', u'DT', u'the')],
 [(u'Internet', u'NP', u'internet')],
 [(u'into', u'IN', u'into')],
 [(u'fast', u'JJ', u'fast')],
 [(u'lanes', u'NNS', u'lane')],
 [(u'for', u'IN', u'for')],
 [(u'wealthy', u'JJ', u'wealthy')],
 [(u'corporations', u'NNS', u'corporation')],
 [(u'and', u'CC', u'and')],
 [(u'slow', u'JJ', u'slow')],
 [(u'lanes', u'NNS', u'lane')],
 [(u'for', u'IN', u'for')],
 [(u'the', u'DT', u'the')],
 [(u'rest', u'NN', u'rest')],
 [(u'of', u'IN', u'of')],
 [(u'us', u'PP', u'us')],
 [(u'.', u'SENT', u'.')],
 [(u'"', u'"', u'"')],
 [(u'The', u'DT', u'the')],
 [(u'comments', u'NNS', u'comment')],
 [(u'have', u'VHP', u'have')],
 [(u'been', u'VBN', u'be')],
 [(u'split', u'NN', u'split')],
 [(u'into', u'IN', u'into')],
 [(u'24', u'CD', u'24')],
 [(u'documents', u'NNS', u'document')],
 [(u'in', u'IN', u'in')],
 [(u'order', u'NN', u'order')],
 [(u'to', u'TO', u'to')],
 [(u'facilitate', u'VV', u'facilitate')],
 [(u'uploading', u'VVG', u'upload')],
 [(u'to', u'TO', u'to')],
 [(u'the', u'DT', u'the')],
 [(u'FCC', u'NP', u'fcc')],
 [(u'website', u'NN', u'website')],
 [(u'.', u'SENT', u'.')],
 [(u'This', u'DT', u'this')],
 [(u'is', u'VBZ', u'be')],
 [(u'the', u'DT', u'the')],
 [(u'first', u'JJ', u'first')],
 [(u'of', u'IN', u'of')],
 [(u'the', u'DT', u'the')],
 [(u'four', u'CD', u'four')],
 [(u'submissions', u'NNS', u'submission')],
 [(u'.', u'SENT', u'.')],
 [(u'Please', u'UH', u'please')],
 [(u'contact', u'NN', u'contact')],
 [(u'me', u'PP', u'me')],
 [(u'if', u'IN', u'if')],
 [(u'you', u'PP', u'you')],
 [(u'have', u'VHP', u'have')],
 [(u'any', u'DT', u'any')],
 [(u'questions', u'NNS', u'question')],
 [(u'about', u'IN', u'about')],
 [(u'the', u'DT', u'the')],
 [(u'attached', u'VVN', u'attach')],
 [(u'comment', u'NN', u'comment')],
 [(u'submissions', u'NNS', u'submission')],
 [(u'.', u'SENT', u'.')],
 [(u'Sincerely', u'RB', u'sincerely')],
 [(u',', u',', u',')],
 [(u'Becky', u'NP', u'becky')],
 [(u'Bond', u'NP', u'bond')],
 [(u'Political', u'NP', u'political')],
 [(u'Director', u'NP', u'director')],
 [(u',', u',', u',')],
 [(u'CREDO', u'NP', u'credo')],
 [(u'Action', u'NP', u'action')],
 [(u'bbond@credoaction.com', u'JJ', u'bbond@credoaction.com')],
 [(u'415-369-2000', u'CD', u'415-369-2000')],
 [],
 [],
 []]

Test the corpus


In [42]:
from models import util
from models import corpus

In [62]:
reload(util)


Out[62]:
<module 'models.util' from '/home/blannon/dev/fcc-net-neutrality-comments/notebooks/../models/util.py'>

In [43]:
lj_corpus = corpus.LazyJSONCorpus(tokenizer=pt_tokenizer, dictionary=None, path_to_text="tagged")

In [44]:
lj_corpus.documents = [debug_target,]

In [47]:
lj_corpus.extract_doctext(debug_target)


Out[47]:
u'Chairman|NP|chairman Tom|NP|tom Wheeler|NP|wheeler Federal|NP|federal Communications|NP|communication Commission|NP|commission 445|CD|445 12th|NP|12th Street|NP|street ,|,|, SW|NP|sw Washington|NP|washington ,|,|, DC|NN|dc 20554|CD|20554 Re|NP|re :|:|: WC|NP|wc Docket|NP|docket No.|NN|no. 14-28|CD|14-28 Comments|NP|comment Dear|NP|dear Chairman|NP|chairman Wheeler|NP|wheeler and|CC|and the|DT|the Commissioners|NPS|commissioner of|IN|of the|DT|the FCC|NP|fcc ,|,|, Enclosed|VVN|enclose are|VBP|be 117|CD|117 ,|,|, 460|CD|460 individual|JJ|individual public|JJ|public comments|NNS|comment collected|VVN|collect by|IN|by CREDO|NP|credo Action|NP|action for|IN|for submission|NN|submission to|TO|to the|DT|the open|JJ|open Internet|NP|internet docket|NN|docket .|SENT|. Individual|JJ|individual comments|NNS|comment may|MD|may differ|VV|differ throughout|IN|throughout ,|,|, although|IN|although the|DT|the majority|NN|majority read|NN|read as|IN|as follows|VVZ|follow :|:|: "|"|" As|IN|as an|DT|a Internet|NP|internet user|NN|user who|WP|who believes|VVZ|believe strongly|RB|strongly in|IN|in the|DT|the importance|NN|importance of|IN|of a|DT|a free|JJ|free and|CC|and open|JJ|open Internet|NP|internet ,|,|, I|PP|i urge|VVP|urge the|DT|the FCC|NP|fcc to|TO|to reclassify|VV|reclassify broadband|NP|broadband Internet|NP|internet access|NN|access as|IN|as a|DT|a service|VVP|service ,|,|, and|CC|and save|VV|save Net|NP|net Neutrality|NP|neutrality .|SENT|. In|IN|in addition|NN|addition ,|,|, the|DT|the FCC|NP|fcc should|MD|should reject|VV|reject the|DT|the proposed|VVN|propose rules|NNS|rule that|WDT|that would|MD|would allow|VV|allow Internet|NP|internet service|NN|service providers|NNS|provider to|TO|to divide|VV|divide the|DT|the Internet|NP|internet into|IN|into fast|JJ|fast lanes|NNS|lane for|IN|for wealthy|JJ|wealthy corporations|NNS|corporation and|CC|and slow|JJ|slow lanes|NNS|lane for|IN|for the|DT|the rest|NN|rest of|IN|of us|PP|us .|SENT|. 
"|"|" The|DT|the comments|NNS|comment have|VHP|have been|VBN|be split|NN|split into|IN|into 24|CD|24 documents|NNS|document in|IN|in order|NN|order to|TO|to facilitate|VV|facilitate uploading|VVG|upload to|TO|to the|DT|the FCC|NP|fcc website|NN|website .|SENT|. This|DT|this is|VBZ|be the|DT|the first|JJ|first of|IN|of the|DT|the four|CD|four submissions|NNS|submission .|SENT|. Please|UH|please contact|NN|contact me|PP|me if|IN|if you|PP|you have|VHP|have any|DT|any questions|NNS|question about|IN|about the|DT|the attached|VVN|attach comment|NN|comment submissions|NNS|submission .|SENT|. Sincerely|RB|sincerely ,|,|, Becky|NP|becky Bond|NP|bond Political|NP|political Director|NP|director ,|,|, CREDO|NP|credo Action|NP|action 415-369-2000|CD|415-369-2000 |NN|\x0c'

In [48]:
# Dotted path of keys leading to the text inside the JSON document.
key_path = "tagged".split('.')

with open(debug_target, 'r') as file_in:
    document = json.load(file_in)

# Follow the key path one dict.get at a time, starting from the whole document.
text = reduce(dict.get, key_path, document)

In [63]:
text


Out[63]:
u'Chairman|NP|chairman Tom|NP|tom Wheeler|NP|wheeler Federal|NP|federal Communications|NP|communication Commission|NP|commission 445|CD|445 12th|NP|12th Street|NP|street ,|,|, SW|NP|sw Washington|NP|washington ,|,|, DC|NN|dc 20554|CD|20554 Re|NP|re :|:|: WC|NP|wc Docket|NP|docket No.|NN|no. 14-28|CD|14-28 Comments|NP|comment Dear|NP|dear Chairman|NP|chairman Wheeler|NP|wheeler and|CC|and the|DT|the Commissioners|NPS|commissioner of|IN|of the|DT|the FCC|NP|fcc ,|,|, Enclosed|VVN|enclose are|VBP|be 117|CD|117 ,|,|, 460|CD|460 individual|JJ|individual public|JJ|public comments|NNS|comment collected|VVN|collect by|IN|by CREDO|NP|credo Action|NP|action for|IN|for submission|NN|submission to|TO|to the|DT|the open|JJ|open Internet|NP|internet docket|NN|docket .|SENT|. Individual|JJ|individual comments|NNS|comment may|MD|may differ|VV|differ throughout|IN|throughout ,|,|, although|IN|although the|DT|the majority|NN|majority read|NN|read as|IN|as follows|VVZ|follow :|:|: "|"|" As|IN|as an|DT|a Internet|NP|internet user|NN|user who|WP|who believes|VVZ|believe strongly|RB|strongly in|IN|in the|DT|the importance|NN|importance of|IN|of a|DT|a free|JJ|free and|CC|and open|JJ|open Internet|NP|internet ,|,|, I|PP|i urge|VVP|urge the|DT|the FCC|NP|fcc to|TO|to reclassify|VV|reclassify broadband|NP|broadband Internet|NP|internet access|NN|access as|IN|as a|DT|a telecommunications|NNS|telecommunication service|VVP|service ,|,|, and|CC|and save|VV|save Net|NP|net Neutrality|NP|neutrality .|SENT|. 
In|IN|in addition|NN|addition ,|,|, the|DT|the FCC|NP|fcc should|MD|should reject|VV|reject the|DT|the proposed|VVN|propose rules|NNS|rule that|WDT|that would|MD|would allow|VV|allow Internet|NP|internet service|NN|service providers|NNS|provider to|TO|to divide|VV|divide the|DT|the Internet|NP|internet into|IN|into fast|JJ|fast lanes|NNS|lane for|IN|for wealthy|JJ|wealthy corporations|NNS|corporation and|CC|and slow|JJ|slow lanes|NNS|lane for|IN|for the|DT|the rest|NN|rest of|IN|of us|PP|us .|SENT|. "|"|" The|DT|the comments|NNS|comment have|VHP|have been|VBN|be split|NN|split into|IN|into 24|CD|24 documents|NNS|document in|IN|in order|NN|order to|TO|to facilitate|VV|facilitate uploading|VVG|upload to|TO|to the|DT|the FCC|NP|fcc website|NN|website .|SENT|. This|DT|this is|VBZ|be the|DT|the first|JJ|first of|IN|of the|DT|the four|CD|four submissions|NNS|submission .|SENT|. Please|UH|please contact|NN|contact me|PP|me if|IN|if you|PP|you have|VHP|have any|DT|any questions|NNS|question about|IN|about the|DT|the attached|VVN|attach comment|NN|comment submissions|NNS|submission .|SENT|. Sincerely|RB|sincerely ,|,|, Becky|NP|becky Bond|NP|bond Political|NP|political Director|NP|director ,|,|, CREDO|NP|credo Action|NP|action bbond@credoaction.com|JJ|bbond@credoaction.com 415-369-2000|CD|415-369-2000 \x0c|NN|\x0c'

In [64]:
util.clean_text(text)


Out[64]:
u'Chairman|NP|chairman Tom|NP|tom Wheeler|NP|wheeler Federal|NP|federal Communications|NP|communication Commission|NP|commission 445|CD|445 12th|NP|12th Street|NP|street ,|,|, SW|NP|sw Washington|NP|washington ,|,|, DC|NN|dc 20554|CD|20554 Re|NP|re :|:|: WC|NP|wc Docket|NP|docket No.|NN|no. 14-28|CD|14-28 Comments|NP|comment Dear|NP|dear Chairman|NP|chairman Wheeler|NP|wheeler and|CC|and the|DT|the Commissioners|NPS|commissioner of|IN|of the|DT|the FCC|NP|fcc ,|,|, Enclosed|VVN|enclose are|VBP|be 117|CD|117 ,|,|, 460|CD|460 individual|JJ|individual public|JJ|public comments|NNS|comment collected|VVN|collect by|IN|by CREDO|NP|credo Action|NP|action for|IN|for submission|NN|submission to|TO|to the|DT|the open|JJ|open Internet|NP|internet docket|NN|docket .|SENT|. Individual|JJ|individual comments|NNS|comment may|MD|may differ|VV|differ throughout|IN|throughout ,|,|, although|IN|although the|DT|the majority|NN|majority read|NN|read as|IN|as follows|VVZ|follow :|:|: "|"|" As|IN|as an|DT|a Internet|NP|internet user|NN|user who|WP|who believes|VVZ|believe strongly|RB|strongly in|IN|in the|DT|the importance|NN|importance of|IN|of a|DT|a free|JJ|free and|CC|and open|JJ|open Internet|NP|internet ,|,|, I|PP|i urge|VVP|urge the|DT|the FCC|NP|fcc to|TO|to reclassify|VV|reclassify broadband|NP|broadband Internet|NP|internet access|NN|access as|IN|as a|DT|a service|VVP|service ,|,|, and|CC|and save|VV|save Net|NP|net Neutrality|NP|neutrality .|SENT|. In|IN|in addition|NN|addition ,|,|, the|DT|the FCC|NP|fcc should|MD|should reject|VV|reject the|DT|the proposed|VVN|propose rules|NNS|rule that|WDT|that would|MD|would allow|VV|allow Internet|NP|internet service|NN|service providers|NNS|provider to|TO|to divide|VV|divide the|DT|the Internet|NP|internet into|IN|into fast|JJ|fast lanes|NNS|lane for|IN|for wealthy|JJ|wealthy corporations|NNS|corporation and|CC|and slow|JJ|slow lanes|NNS|lane for|IN|for the|DT|the rest|NN|rest of|IN|of us|PP|us .|SENT|. 
"|"|" The|DT|the comments|NNS|comment have|VHP|have been|VBN|be split|NN|split into|IN|into 24|CD|24 documents|NNS|document in|IN|in order|NN|order to|TO|to facilitate|VV|facilitate uploading|VVG|upload to|TO|to the|DT|the FCC|NP|fcc website|NN|website .|SENT|. This|DT|this is|VBZ|be the|DT|the first|JJ|first of|IN|of the|DT|the four|CD|four submissions|NNS|submission .|SENT|. Please|UH|please contact|NN|contact me|PP|me if|IN|if you|PP|you have|VHP|have any|DT|any questions|NNS|question about|IN|about the|DT|the attached|VVN|attach comment|NN|comment submissions|NNS|submission .|SENT|. Sincerely|RB|sincerely ,|,|, Becky|NP|becky Bond|NP|bond Political|NP|political Director|NP|director ,|,|, CREDO|NP|credo Action|NP|action 415-369-2000|CD|415-369-2000 |NN| '

In [66]:
# Apply every (old, new) replacement pair from util.map_chars to the text,
# one after another.
cleaned_text = text
for old, new in util.map_chars.items():
    cleaned_text = cleaned_text.replace(old, new)

In [76]:
pt_tokenizer.tokenize(cleaned_text)


Out[76]:
[u'chairman',
 u'tom',
 u'wheeler',
 u'federal',
 u'communication',
 u'commission',
 u'street',
 u'sw',
 u'washington',
 u'dc',
 u're',
 u'wc',
 u'docket',
 u'comment',
 u'dear',
 u'chairman',
 u'wheeler',
 u'commissioner',
 u'fcc',
 u'enclose',
 u'individual',
 u'public',
 u'comment',
 u'collect',
 u'credo',
 u'action',
 u'submission',
 u'open',
 u'internet',
 u'docket',
 u'individual',
 u'comment',
 u'may',
 u'differ',
 u'throughout',
 u'although',
 u'majority',
 u'read',
 u'follow',
 u'internet',
 u'user',
 u'believe',
 u'strongly',
 u'importance',
 u'free',
 u'open',
 u'internet',
 u'urge',
 u'fcc',
 u'reclassify',
 u'broadband',
 u'internet',
 u'access',
 u'telecommunication',
 u'service',
 u'save',
 u'net',
 u'neutrality',
 u'addition',
 u'fcc',
 u'reject',
 u'propose',
 u'rule',
 u'would',
 u'allow',
 u'internet',
 u'service',
 u'provider',
 u'divide',
 u'internet',
 u'fast',
 u'lane',
 u'wealthy',
 u'corporation',
 u'slow',
 u'lane',
 u'rest',
 u'us',
 u'comment',
 u'split',
 u'document',
 u'order',
 u'facilitate',
 u'upload',
 u'fcc',
 u'website',
 u'first',
 u'four',
 u'submission',
 u'please',
 u'contact',
 u'question',
 u'attach',
 u'comment',
 u'submission',
 u'sincerely',
 u'becky',
 u'bond',
 u'political',
 u'director',
 u'credo',
 u'action']

In [ ]: