notebook.community

Edit and run



In [1]:

    
# Render matplotlib figures inline in the notebook output.
%matplotlib inline



In [2]:

    
import nltk
from nltk.chunk import ne_chunk
from nltk.corpus.reader import ChunkedCorpusReader
from nltk.corpus import stopwords



In [50]:

    
# Disabled alternative: read the corpus directory with ChunkedCorpusReader.
#chunk_dir = nltk.data.find('corpora/ncbi_corpus_chunked_single')
#chunk = ChunkedCorpusReader(chunk_dir, '.*\.txt')



In [3]:

    
# Locate the 'corpora/ncbi_corpus_chunked_single' resource on the NLTK data
# path and open every .txt file in it as a corpus of (word, tag) pairs.
tagged_dir = nltk.data.find('corpora/ncbi_corpus_chunked_single')
# Raw string: `\.` is a regex escape for a literal dot, not a string escape.
tagged = nltk.corpus.reader.TaggedCorpusReader(tagged_dir, r'.*\.txt')



In [13]:

    
# Peek at the first 20 (word, tag) pairs to sanity-check the loaded corpus.
# The parenthesised single-argument form prints identically under the
# Python 2 print statement and the Python 3 print function.
print(tagged.tagged_words()[0:20])









    



[(u'A', u'O'), (u'common', u'O'), (u'human', u'O'), (u'skin', u'B-D012878'), (u'tumour', u'I-D012878'), (u'is', u'O'), (u'caused', u'O'), (u'by', u'O'), (u'activating', u'O'), (u'mutations', u'O'), (u'in', u'O'), (u'beta-catenin', u'O'), (u'WNT', u'O'), (u'signalling', u'O'), (u'orchestrates', u'O'), (u'a', u'O'), (u'number', u'O'), (u'of', u'O'), (u'developmental', u'O'), (u'programs', u'O')]



In [20]:

    
# Lower-cased token text for every (word, tag) pair, in corpus order.
words = [token.lower() for token, _label in tagged.tagged_words()]



In [29]:

    
# Tag of every (word, tag) pair, in corpus order (parallel to the tokens).
tags = [label for _token, label in tagged.tagged_words()]



In [33]:

    
# Collect each distinct 'B-' (entity-start) tag exactly once, keeping the
# order in which the tags first appear in the corpus.
unique_tags = []
seen_tags = set()  # O(1) membership test for tags already collected

for tag in tags:
    # Prefix check: a plain substring test would also match 'B-' occurring
    # in the middle of a tag.
    if tag.startswith('B-') and tag not in seen_tags:
        seen_tags.add(tag)
        unique_tags.append(tag)



In [40]:

    
# Show the first five tokens and their tags side by side as a quick check
# that the two lists line up. The parenthesised single-argument form prints
# identically under Python 2 and Python 3.
print(words[0:5])
print(tags[0:5])









    



[u'a', u'common', u'human', u'skin', u'tumour']
[u'O', u'O', u'O', u'B-D012878', u'I-D012878']



In [6]:

    
# Build the stopword set once. Calling stopwords.words('english') inside the
# comprehension would rebuild the list and scan it linearly for every token.
english_stopwords = set(stopwords.words('english'))
# Keep only the tokens that are not English stopwords, preserving order.
words_stop = [w for w in words if w not in english_stopwords]



In [7]:

    
# Count occurrences of each remaining token, then chart the most frequent
# ones as a plain (non-cumulative) frequency plot.
top_n = 30  # number of samples to draw
fd = nltk.FreqDist(words_stop)
fd.plot(top_n, cumulative=False)



In [ ]:

    
tagged.