In [1]:
from __future__ import division
# basic NLP
import nltk, codecs, string, random, math, cPickle as pickle, re, datetime
from collections import Counter
from string import punctuation
# scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics.pairwise import linear_kernel
# plotting
%matplotlib inline
from matplotlib import pyplot as plt
# sentence tokenizer and stopword list
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
stopset = set(nltk.corpus.stopwords.words('english'))
In [3]:
%load_ext sql
# connect to the MySQL database (connection string elided) and silence row-count feedback
%sql mysql://
%config SqlMagic.feedback = False
In [4]:
def chunk_phrases(tagged_text, grammar):
    '''
    tagged_text: list of lists of (word, tag, lemma) 3-tuples, as returned by our Stanford tagger procedure
    grammar: raw text of regular-expression chunk patterns
    '''
    # set up noun phrase grammar
    cp = nltk.RegexpParser(grammar)
    # initialize new list of sentences
    new_sents = []
    for sent in tagged_text:
        # re-tag punctuation as 'XX' to keep it out of the chunker patterns
        clean_sent = [(w, t if w[0] not in punctuation else 'XX') for w, t, l in sent]
        # collapse each chunk subtree into a single (phrase, label) pair;
        # unchunked (word, tag) tuples pass through unchanged
        new_sents.append([t if isinstance(t, tuple) else (" ".join(a for a, b in t.leaves()), t.node)
                          for t in cp.parse(clean_sent)])
    return new_sents
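For reference, this is the input shape chunk_phrases expects: a list of sentences, each a list of (word, POS tag, lemma) 3-tuples. A hedged sketch with a made-up sentence and a one-rule grammar, not real tagger output:

In [ ]:
# hedged sketch: toy tagger output, one sentence of (word, tag, lemma) 3-tuples
example = [[('Patients', 'NNS', 'patient'), ('with', 'IN', 'with'),
            ('relapsing', 'VBG', 'relapse'), ('disease', 'NN', 'disease'),
            ('.', '.', '.')]]
print chunk_phrases(example, r"NP1: {<JJ.*>*<NN.*>+}")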
In [5]:
def print_sent(chunked):
    for sent in chunked:
        new_sent = ''
        for w, t in sent:
            if t[:2] == 'NP':
                # highlight noun phrase chunks as <<UPPERCASE>>
                new_sent += '<<%s>> ' % w.upper()
            else:
                new_sent += w + ' '
        print new_sent
        print
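A quick illustration of the display convention on toy chunker output (made-up phrases, not real trial text):

In [ ]:
# hedged sketch: NP-labelled chunks print as <<UPPERCASE>>, everything else verbatim
print_sent([[('diagnosis of', 'NP1'), ('clinically', 'RB'), ('definite', 'JJ'),
             ('multiple sclerosis', 'NP1')]])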
In [17]:
grammar = r"""
  NP1: {(<NN.*><POS>)?<RB>?(<JJ.*>|<VBN>)*<NN.*>+}  # noun phrase: optional possessive noun, adverb, adjectives/participles, then nouns
  NP2: {(<JJ.*>|<VB.*>)<XX>}                        # adjective or verb immediately before punctuation
"""
In [14]:
# pull Stanford-tagged eligibility criteria for trials indexed under this MeSH condition
condition = 'Sclerosis'
tagcrit = %sql select tagged_text from criteria_tagged join criteria_text using (criteria_text_id) join condition_browse using (nct_id) where mesh_term = :condition
In [18]:
# tagged_text is stored as the Python repr of the sentence list, so eval() reconstructs it
print_sent(chunk_phrases(eval(tagcrit[4000][0]), grammar))