In [1]:
# basic NLP
import nltk, codecs, string, random, math, cPickle as pickle, re, datetime
from collections import Counter
from string import punctuation

# scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.metrics.pairwise import linear_kernel

# plotting
%matplotlib inline
from matplotlib import pyplot as plt

from __future__ import division

# Punkt English sentence tokenizer, loaded from NLTK's data directory
# (the 'tokenizers/punkt' resource must be installed).
sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
# English stopwords from the NLTK stopwords corpus, held as a set for fast membership tests.
stopset = set(nltk.corpus.stopwords.words('english'))

In [3]:
# Load the `sql` IPython extension, which provides the %sql magic used below.
%load_ext sql
# Open the MySQL connection that later %sql calls run against.
# NOTE(review): the URL carries no host/user/database here -- presumably stripped
# before sharing; keep credentials out of the notebook itself.
%sql mysql://
# Switch off the SqlMagic `feedback` option.
%config SqlMagic.feedback = False

In [4]:
def _chunk_label(chunk):
    '''
    Return the label of a chunk subtree on any NLTK version:
    NLTK 3 exposes it through label(), older releases through the .node attribute.
    '''
    label = getattr(chunk, 'label', None)
    return label() if callable(label) else chunk.node


def chunk_phrases(tagged_text, grammar):
    '''
    Chunk tagged sentences with a regular-expression chunk grammar.

    tagged_text: list of list of 3-tuples, returned by our Stanford tagger procedure
                 (only the first two elements, word and tag, are used here)
    grammar: raw text representing regular expression chunks

    Returns a list with one entry per input sentence. Each entry is a list of
    2-tuples: tokens outside any chunk stay as (word, tag), and each chunk is
    collapsed to (its words joined by single spaces, chunk label).
    '''
    # set up noun phrase grammar
    cp = nltk.RegexpParser(grammar)

    # initialize new list of sentences
    new_sents = []
    for sent in tagged_text:
        # Retag every token that starts with a punctuation character as 'XX' so the
        # grammar can refer to punctuation with a single tag. The `word and` guard
        # keeps an empty token from raising IndexError; it simply keeps its tag.
        clean_sent = [(word, 'XX' if (word and word[0] in punctuation) else tag)
                      for word, tag, _unused in sent]

        chunked_sent = []
        for item in cp.parse(clean_sent):
            if isinstance(item, tuple):
                # plain (word, tag) token that no chunk rule matched
                chunked_sent.append(item)
            else:
                # chunk subtree: flatten its leaves into one phrase string
                phrase = " ".join([leaf_word for (leaf_word, leaf_tag) in item.leaves()])
                chunked_sent.append((phrase, _chunk_label(item)))
        new_sents.append(chunked_sent)

    return new_sents

In [5]:
def print_sent(chunked):
    '''
    Print chunked sentences, one per line, each followed by a blank line.

    chunked: list of sentences, each a list of (text, label) 2-tuples, in the
             shape returned by chunk_phrases.

    Items whose label starts with 'NP' are shown upper-cased inside << >>;
    all other items are printed as-is. Every item is followed by one space.
    '''
    for sent in chunked:
        pieces = []
        for w, t in sent:
            if t[:2] == 'NP':
                pieces.append('<<%s>> ' % w.upper())
            else:
                pieces.append(w + ' ')
        # Single-argument print(...) emits the same bytes under Python 2's print
        # statement and Python 3's print function.
        print(''.join(pieces))
        print('')

In [17]:
# Chunk grammar passed to nltk.RegexpParser by chunk_phrases; each <...> is a regex over tags.
#   NP1: optional (noun + possessive marker), optional adverb, any number of
#        adjectives or past participles, then one or more nouns.
#   NP2: an adjective or any verb form immediately followed by an 'XX' token
#        ('XX' is the tag chunk_phrases gives to tokens starting with punctuation).
grammar = r"""
              NP1: {(<NN.*><POS>)?<RB>?(<JJ.*>|<VBN>)*<NN.*>+}
              NP2: {(<JJ.*>|<VB.*>)<XX>}
           """

In [14]:
# MeSH term to filter on; referenced in the query below as the named parameter :condition.
condition = 'Sclerosis'
# Select tagged_text from criteria_tagged, joined to criteria_text (on criteria_text_id) and
# condition_browse (on nct_id), keeping rows whose mesh_term equals `condition` exactly.
tagcrit = %sql select tagged_text from criteria_tagged join criteria_text using (criteria_text_id) join condition_browse using (nct_id) where mesh_term = :condition

In [18]:
# Chunk and print one sample record. The row index 4000 is hard-coded, so this
# raises IndexError when the query returns 4000 rows or fewer.
# NOTE(review): eval() executes whatever string is stored in tagged_text. If the column
# holds a plain Python literal, ast.literal_eval would be the safer parser -- confirm
# the stored format before switching.
print_sent(chunk_phrases(eval(tagcrit[4000][0]), grammar))


<<KARNOFSKY PERFORMANCE STATUS>> -LRB- <<PS>> -RRB- 60-100 % -LRB- for <<PATIENTS>> > 16 <<YEARS>> of <<AGE>> -RRB- <<OR LANKSY PS>> 60-100 % -LRB- for <<PATIENTS>> = < 16 <<YEARS>> of <<AGE>> -RRB- 


In [ ]:


In [ ]: