In [8]:
from bs4 import BeautifulSoup
import httplib, codecs, datetime
import cPickle as pickle

In [2]:
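# Start the Stanford POS tagger server with the command below before running the cells that follow.
# Use -Xmx3g instead of the default -mx300m so the JVM has enough memory for the tagger.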

# java -Xmx3g -cp stanford-postagger-3.5.1.jar edu.stanford.nlp.tagger.maxent.MaxentTaggerServer -model models/english-bidirectional-distsim.tagger -outputFormat xml -outputFormatOptions lemmatize -port 2020

In [2]:
server = httplib.HTTPConnection('127.0.0.1:2020')

In [3]:
# Load every line of the sentence list into memory (trailing newlines are kept).
# The `with` block closes the file handle as soon as the lines are read instead
# of leaving it open until garbage collection.
with codecs.open('data/stanford_sentence_list.csv', 'r') as criteria_file:
    criteria = criteria_file.readlines()

In [12]:
#start = datetime.datetime.now()
def stan_tag(criteria):
    tagged = []
    for ix, c in enumerate(criteria[:10000]):
        # initialize list of sentences
        sents = []

        # send text to server
        server.request('', c)
        res = BeautifulSoup(server.getresponse().read())

        # loop through sentences to generate lists of tagged/lemmatized tuples
        for sentence in res.findAll('sentence'):
            sent_tag = []
            for word in sentence.findAll('word'):
                sent_tag.append((word.get_text(), word['pos'], word['lemma']))
            sents.append(sent_tag)

        # add sentence to tagged list
        tagged.append(sents)
        
        #save every 100,000 lines
        if ix%100000 == 0:
            print 'Line: ', ix
            pickle.dump(tagged, open('data/stanford_tagged_criteria.pkl', 'wb'))
    pickle.dump(tagged, open('data/stanford_tagged_criteria.pkl', 'wb'))
    print 'Complete'

#print datetime.datetime.now() - start

In [13]:
# Run the tagger over the loaded sentence list; results are pickled by stan_tag.
# NOTE(review): this call does not bind a notebook-level `tagged`, yet later
# cells read `tagged[106]` -- they presumably relied on earlier kernel state.
stan_tag(criteria)


Line:  0
Line:  1000
Line:  2000
Line:  3000
Line:  4000
Line:  5000
Line:  6000
Line:  7000
Line:  8000
Line:  9000
Complete

In [6]:
# Show one raw input line for comparison with the tagged output.
# NOTE(review): index 100106 here is compared with tagged[106] in the next
# cell; that only lines up if tagging started at offset 100000 rather than
# at criteria[:10000] as stan_tag is written -- confirm.
criteria[100106]


Out[6]:
'Must be willing to use dual method of contraception (i.e., barrier and spermicide; birth control pills and barrier) during the study.\n'

In [7]:
# Show the tagged (text, pos, lemma) tuples for one entry.
# NOTE(review): `tagged` is only assigned inside stan_tag in the cells shown,
# so this fails on a fresh kernel unless it is defined elsewhere.
tagged[106]


Out[7]:
[[(u'Must', u'MD', u'must'),
  (u'be', u'VB', u'be'),
  (u'willing', u'JJ', u'willing'),
  (u'to', u'TO', u'to'),
  (u'use', u'VB', u'use'),
  (u'dual', u'JJ', u'dual'),
  (u'method', u'NN', u'method'),
  (u'of', u'IN', u'of'),
  (u'contraception', u'NN', u'contraception'),
  (u'-LRB-', u'-LRB-', u'-lrb-'),
  (u'i.e.', u'FW', u'i.e.'),
  (u',', u',', u','),
  (u'barrier', u'NN', u'barrier'),
  (u'and', u'CC', u'and'),
  (u'spermicide', u'NN', u'spermicide'),
  (u';', u':', u';'),
  (u'birth', u'NN', u'birth'),
  (u'control', u'NN', u'control'),
  (u'pills', u'NNS', u'pill'),
  (u'and', u'CC', u'and'),
  (u'barrier', u'NN', u'barrier'),
  (u'-RRB-', u'-RRB-', u'-rrb-'),
  (u'during', u'IN', u'during'),
  (u'the', u'DT', u'the'),
  (u'study', u'NN', u'study'),
  (u'.', u'.', u'.')]]

In [10]:
# Convert the nested list of tuples to its string representation.
var = str(tagged[106])

In [11]:
# Rebuild the nested list from its string representation.
# NOTE(review): eval() executes arbitrary expressions; for strings that hold
# only literals, ast.literal_eval is the safe way to do this round trip.
eval(var)


Out[11]:
[[(u'Must', u'MD', u'must'),
  (u'be', u'VB', u'be'),
  (u'willing', u'JJ', u'willing'),
  (u'to', u'TO', u'to'),
  (u'use', u'VB', u'use'),
  (u'dual', u'JJ', u'dual'),
  (u'method', u'NN', u'method'),
  (u'of', u'IN', u'of'),
  (u'contraception', u'NN', u'contraception'),
  (u'-LRB-', u'-LRB-', u'-lrb-'),
  (u'i.e.', u'FW', u'i.e.'),
  (u',', u',', u','),
  (u'barrier', u'NN', u'barrier'),
  (u'and', u'CC', u'and'),
  (u'spermicide', u'NN', u'spermicide'),
  (u';', u':', u';'),
  (u'birth', u'NN', u'birth'),
  (u'control', u'NN', u'control'),
  (u'pills', u'NNS', u'pill'),
  (u'and', u'CC', u'and'),
  (u'barrier', u'NN', u'barrier'),
  (u'-RRB-', u'-RRB-', u'-rrb-'),
  (u'during', u'IN', u'during'),
  (u'the', u'DT', u'the'),
  (u'study', u'NN', u'study'),
  (u'.', u'.', u'.')]]

In [ ]: