In [1]:
%matplotlib inline
In [2]:
import nltk
from nltk.chunk import ne_chunk
from nltk.corpus.reader import ChunkedCorpusReader
from nltk.corpus import stopwords
In [50]:
#chunk_dir = nltk.data.find('corpora/ncbi_corpus_chunked_single')
#chunk = ChunkedCorpusReader(chunk_dir, '.*\.txt')
In [3]:
# Locate the corpus directory inside the NLTK data path and open every .txt
# file in it as a word/tag corpus.
tagged_dir = nltk.data.find('corpora/ncbi_corpus_chunked_single')
# Raw string: '\.' is not a valid string escape, so the non-raw form triggers
# an invalid-escape warning on newer Python 3. The regex itself is unchanged.
tagged = nltk.corpus.reader.TaggedCorpusReader(tagged_dir, r'.*\.txt')
In [13]:
# Preview the first 20 (word, tag) pairs. The parenthesised form prints the
# same thing on Python 2 and is also valid on Python 3.
print(tagged.tagged_words()[0:20])
In [20]:
# Lower-cased token of every (word, tag) pair in the corpus, in corpus order.
words = [token.lower() for token, _label in tagged.tagged_words()]
In [29]:
# Tag of every (word, tag) pair in the corpus, aligned index-for-index with
# the token sequence returned by tagged.tagged_words().
tags = [label for _token, label in tagged.tagged_words()]
In [33]:
# Collect every tag that contains the 'B-' marker, preserving corpus order.
# NOTE(review): despite the name, repeated tags are kept (one entry per
# occurrence) -- wrap in set() if distinct tag values were intended.
unique_tags = [tag for tag in tags if 'B-' in tag]
In [40]:
# Sanity check: the first five tokens and their tags should line up.
# Parenthesised print works on both Python 2 and Python 3.
print(words[0:5])
print(tags[0:5])
In [6]:
# Build the stop-word lookup once. The original re-evaluated
# stopwords.words('english') for every token and searched it as a list;
# a set gives the same filtering result with constant-time membership tests.
english_stopwords = set(stopwords.words('english'))
# Tokens with English stop words removed (order and duplicates preserved).
words_stop = [w for w in words if w not in english_stopwords]
In [7]:
# Frequency distribution over the stop-word-filtered, lower-cased tokens.
fd = nltk.FreqDist(words_stop)
# Plot the 30 most frequent tokens as raw (non-cumulative) counts; the title
# lets the figure stand on its own when the notebook is skimmed.
fd.plot(30, cumulative=False, title='30 most frequent non-stopword tokens')
In [ ]:
tagged.