In [8]:
from bs4 import BeautifulSoup
import httplib, codecs, datetime
import cPickle as pickle
In [2]:
# change the JVM default -mx300m to -Xmx3g to handle the memory demand
# java -Xmx3g -cp stanford-postagger-3.5.1.jar edu.stanford.nlp.tagger.maxent.MaxentTaggerServer -model models/english-bidirectional-distsim.tagger -outputFormat xml -outputFormatOptions lemmatize -port 2020
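Before running the batch below, it is worth confirming the tagger server is actually listening; a minimal reachability check (a sketch, assuming the port 2020 used above):

In [ ]:
import socket
# quick sanity check: open and close a TCP connection to the tagger port
sock = socket.create_connection(('127.0.0.1', 2020), timeout=5)
sock.close()
print 'tagger server is up'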
In [2]:
server = httplib.HTTPConnection('127.0.0.1:2020')
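The MaxentTaggerServer appears to read raw text lines rather than speaking real HTTP, so the request pattern used below relies on httplib's lenient HTTP/0.9 fallback: passing an empty method string puts the sentence directly on the request line, and the tagger's XML comes back as the raw response body. A single-sentence round trip to verify the setup (a sketch):

In [ ]:
# one round trip; the response body is the tagger's XML output
server.request('', 'The cat sat on the mat.')
print server.getresponse().read()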
In [3]:
criteria = codecs.open('data/stanford_sentence_list.csv','r').readlines()
In [12]:
#start = datetime.datetime.now()
def stan_tag(criteria):
    tagged = []
    for ix, c in enumerate(criteria[:10000]):
        # initialize list of sentences for this criteria line
        sents = []
        # send text to the tagger server (the empty method string passes the
        # line straight through as the raw request line)
        server.request('', c)
        res = BeautifulSoup(server.getresponse().read())
        # loop through sentences to build lists of (word, POS, lemma) tuples
        for sentence in res.findAll('sentence'):
            sent_tag = []
            for word in sentence.findAll('word'):
                sent_tag.append((word.get_text(), word['pos'], word['lemma']))
            sents.append(sent_tag)
        # add the tagged sentences for this line
        tagged.append(sents)
        # checkpoint: re-dump everything tagged so far every 100,000 lines
        if ix % 100000 == 0:
            print 'Line: ', ix
            pickle.dump(tagged, open('data/stanford_tagged_criteria.pkl', 'wb'))
    # final dump of the complete list
    pickle.dump(tagged, open('data/stanford_tagged_criteria.pkl', 'wb'))
    print 'Complete'
    return tagged
#print datetime.datetime.now() - start
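For reference, the parsing loop above expects the tagger's XML output to wrap each sentence in a <sentence> element and each token in a <word> element carrying pos and lemma attributes, roughly (illustrative, abbreviated):

<sentence>
  <word pos="DT" lemma="the">The</word>
  <word pos="NN" lemma="cat">cat</word>
  ...
</sentence>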
In [13]:
tagged = stan_tag(criteria)
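Each element of tagged corresponds to one criteria line and holds a list of sentences, each of which is a list of (word, POS tag, lemma) tuples; for illustration (hypothetical values, not actual tagger output):

In [ ]:
# shape only -- the values here are hypothetical
[[('Patients', 'NNS', 'patient'), ('must', 'MD', 'must'), ('consent', 'VB', 'consent')]]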
In [6]:
criteria[100106]
Out[6]:
In [7]:
tagged[106]
Out[7]:
In [10]:
# round-trip check: serialize the nested tuple structure to a string
var = str(tagged[106])
In [11]:
eval(var)
Out[11]:
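Since the structure contains only literals, ast.literal_eval is a safer way to reverse the str() round trip than eval (a sketch):

In [ ]:
import ast
# parses the repr back into nested lists/tuples without executing arbitrary code
ast.literal_eval(var)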